def crawl_kicnews(crawler, out):
    urls = find_wordpress_urls(crawler, 'http://karen.kicnews.org/')
    urls = [u for u in urls if '%' in u]
    for url in sorted(urls):
        doc = crawler.fetch(url)
        assert doc.status == 200, (doc.status, url)
        html = doc.content.decode('utf-8')
        title = re.search(r'<title>(.+?)</title>', html).group(1)
        pubdate = re.search(
            r'<meta itemprop="datePublished" content="([^"]+)">', html)
        pubdate = cleantext(pubdate.group(1)) if pubdate else None
        try:
            text = html.split(
                '<div class="td-post-content td-pb-padding-side">',
                1)[1].split('<div class="essb_links')[0]
            text = text.replace('\n', ' ').replace('</p>', '\n')
        except Exception:
            print('No content: %s' % url)
            continue
        paras = [cleantext(p) for p in [title] + text.splitlines()]
        paras = filter(None, paras)
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        out.write('\n'.join(paras) + '\n')

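# Several crawls in this section (kicnews, kwayedza, shannews, kuensel) rely
# on a find_wordpress_urls() helper that is defined elsewhere in the project.
# The sketch below only illustrates one plausible shape for it — discovery via
# the conventional WordPress /sitemap.xml — and is not the actual
# implementation; the name is prefixed with _sketch_ to make that explicit.
def _sketch_find_wordpress_urls(crawler, site, allow_404=False):
    """Hypothetical: collect article URLs from a WordPress site's sitemap."""
    # allow_404 is accepted only for signature compatibility with the call
    # sites above; a real implementation would tolerate missing archive pages.
    urls = set()
    sitemap = crawler.fetch_sitemap(site.rstrip('/') + '/sitemap.xml')
    for url in sitemap.keys():
        # Skip listing pages; keep only article permalinks.
        if '/category/' in url or '/tag/' in url or '/author/' in url:
            continue
        urls.add(url)
    return urls
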
def crawl_kwayedza(crawler, out):
    urls = find_wordpress_urls(crawler, site='http://www.kwayedza.co.zw/')
    for url in sorted(urls):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        try:
            html = doc.content.decode('utf-8')
        except UnicodeDecodeError:
            continue
        title = re.search(r'<title>(.+?)</title>', html)
        title = title.group(1) if title else None
        if html.find('itemprop="articleBody"') < 0:
            continue
        pubdate = re.search(r'datetime="(.+?)" itemprop="datePublished"',
                            html)
        if pubdate:
            pubdate = cleantext(pubdate.group(1))
        body = html.split('itemprop="articleBody"', 1)[1].split('>', 1)[1]
        body = body.split('<!-- .post-content -->')[0]
        body = body.split('<div class="post-share">')[0]
        body = body.replace('</p>', '\n').replace('</div>', '\n')
        paras = [title] + body.splitlines()
        paras = list(filter(None, [cleantext(p) for p in paras]))
        if not paras:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        out.write('\n'.join(paras) + '\n')

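# cleantext() and extract() are shared utilities used by nearly every crawl
# function in this section but defined outside it. The sketches below show
# the behaviour the call sites appear to assume — extract() returns the text
# between two markers (or None), cleantext() strips tags, decodes entities
# and collapses whitespace. They are illustrative approximations, not the
# project's own utilities; `re` is assumed to be imported at module level,
# as in the functions above.
import html as _html

def _sketch_extract(start, end, text):
    """Hypothetical: return the substring between start and end, else None."""
    startpos = text.find(start)
    if startpos < 0:
        return None
    startpos += len(start)
    endpos = text.find(end, startpos)
    if endpos < 0:
        return None
    return text[startpos:endpos]

def _sketch_cleantext(text):
    """Hypothetical: strip markup and collapse whitespace."""
    text = re.sub(r'<[^>]*>', ' ', text)  # drop any remaining tags
    text = _html.unescape(text)           # &amp; -> &, &#x2019; -> ’, ...
    return ' '.join(text.split())         # collapse runs of whitespace
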
def crawl_larenadomila_it(crawler):
    out = crawler.get_output(language='vec-u-sd-itvr')
    urls = find_urls_in_larenadomila_it(
        crawler, 'https://www.larenadomila.it/sito/index.php')
    for url in sorted(urls.difference(BLACKLISTED_URLS)):
        if url.find('&view=article&') < 0:
            continue
        doc = crawler.fetch(url)
        assert doc.status == 200, (doc.status, url)
        content = doc.content.decode('utf-8')
        title = cleantext(extract('<title>', '</title>', content))
        sections = [title] + [c.strip() for c in content.splitlines()]
        sections = [c for c in sections
                    if c.startswith('<div class="item_fulltext">')
                    or c.startswith('<p><span class="grassetto">')]
        sections = [c.replace(' <br />- ', ' ') for c in sections]
        text = '<br/>'.join(sections)
        text = text.replace('\xa0', ' ')  # NBSP used for spacing/formatting
        text = re.sub(r'</(?:div|DIV|p|P|[hH][1-6]|table)>', '\n', text)
        text = re.sub(r'<br\s*/?>', '\n', text)
        text = re.sub(r'\.{3,}', '… ', text)
        text = re.sub(r'\n(-)[^\s]', '- ', text)
        paras = list(filter(None, [cleantext(p) for p in text.split('\n')]))
        if not paras:
            continue
        out.write('# Location: %s\n' % url)
        out.write('\n'.join(paras) + '\n')

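# crawl_larenadomila_it() above depends on find_urls_in_larenadomila_it(),
# which is not part of this section. The site is a Joomla installation, so a
# breadth-first walk over internal index.php links is one plausible shape for
# the helper; this is a hedged sketch, not the real discovery logic.
def _sketch_find_urls_in_larenadomila_it(crawler, start_url):
    """Hypothetical: collect internal article/category links from the site."""
    site = 'https://www.larenadomila.it'
    seen, pending = set(), {start_url}
    while pending:
        url = pending.pop()
        seen.add(url)
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        content = doc.content.decode('utf-8')
        for path in re.findall(r'href="(/sito/index\.php[^"]*)"', content):
            link = site + path.replace('&amp;', '&')
            if link not in seen:
                pending.add(link)
    return seen
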
def crawl_irishtimes(crawler, out):
    start = 'https://www.irishtimes.com/culture/treibh'
    pubdatere1 = re.compile(
        r'<meta itemprop="datePublished" content="([^"]*)"/>')
    pubdatere2 = re.compile(r'"datePublished": "([^"]*)"')
    links = set()
    for contents in _irishtimes_section_list(crawler, out, start):
        init = crawler.fetch(contents)
        if init.status != 200:
            continue
        shtml = init.content.decode('utf-8')
        for doclink in re.findall('<p><a href="/culture/treibh/([^"]*)"',
                                  shtml):
            links.add('%s/%s' % (start, doclink))
    for url in links:
        res = crawler.fetch(url)
        if res.status != 200:
            continue
        html = res.content.decode('utf-8')
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        title = re.search(r'<title>(.+?)</title>', html)
        pubdate_match = pubdatere1.search(html)
        pubdate_match = (pubdate_match if pubdate_match
                         else pubdatere2.search(html))
        pubdate = pubdate_match.group(1) if pubdate_match else None
        if pubdate is None:
            pubdate = res.headers.get('Last-Modified')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        if title:
            out.write(cleantext(title.group(1)) + '\n')
        for paragraph in re.findall(
                r'<p class="no_name">(.+?)</p>',
                html.split('<div class="article_bodycopy">')[1]):
            cleaned = cleantext(paragraph)
            out.write(cleaned + '\n')

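# crawl_irishtimes() iterates over _irishtimes_section_list(), a helper that
# is defined elsewhere and presumably yields the paginated listing URLs of
# the /culture/treibh section. The generator below is only a guess at that
# behaviour; the page range and the ?page= query parameter are assumptions.
def _sketch_irishtimes_section_list(crawler, out, start):
    """Hypothetical: yield the section's table-of-contents page URLs."""
    yield start
    for page in range(2, 50):  # assumed upper bound on listing pages
        yield '%s?page=%d' % (start, page)
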
def crawl_loksatta_com(crawler, out):
    sitemap = crawler.fetch_sitemap('http://www.loksatta.com/sitemap.xml')
    for url in sorted(sitemap):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        try:
            html = doc.content.decode('utf-8')
        except UnicodeDecodeError:
            continue
        pubdate = re.search(
            r'<meta itemprop="datePublished" content="(.+?)"', html)
        pubdate = cleantext(pubdate.group(1)) if pubdate else None
        headline = extract('<h1 itemprop="headline" id="headline">', '</h1>',
                           html)
        synopsis = extract('<h2 itemprop="description" class="synopsis">',
                           '</h2>', html)
        text = extract('itemprop="articleBody">', '<div', html)
        if not text:
            continue
        text = text.replace('\n', ' ')
        text = re.sub(r'</?(?:br|BR|p|P)\s*?/?>', '\n', text)
        paras = [headline, synopsis] + text.splitlines()
        paras = list(filter(None, [cleantext(p) for p in paras]))
        if paras:
            out.write('# Location: %s\n# Genre: News\n' % url)
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')

def crawl_nuachtrte(crawler, out):
    sitemap = crawler.fetch_sitemap(
        'http://www.rte.ie/sitemap.xml',
        subsitemap_filter=lambda x: _check_rte_sitemap(x))
    pubdate_regex = re.compile(
        r'name="DC.date" (?:scheme="DCTERMS.URI" )?'
        r'content="([0-9T:+\-]{19,25})"')
    for url in sorted(sitemap.keys()):
        if not _rtenuacht_path(url):
            continue
        fetchresult = crawler.fetch(url)
        if fetchresult.status != 200:
            continue
        html = fetchresult.content.decode('utf-8')
        pubdate_match = pubdate_regex.search(html)
        pubdate = pubdate_match.group(1) if pubdate_match else None
        if pubdate is None:
            pubdate = fetchresult.headers.get('Last-Modified')
        if pubdate is None:
            pubdate = sitemap[url]
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        title = re.search(r'<title>(.+?)</title>', html)
        if title:
            title = striptags(title.group(1).split('- RTÉ')[0]).strip()
        if title:
            out.write(cleantext(title) + '\n')
        for paragraph in re.findall(r'<p>(.+?)</p>', html):
            cleaned = cleantext(paragraph)
            if _rte_writable_paragraph(cleaned):
                out.write(cleaned + '\n')

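# crawl_nuachtrte() relies on two small predicates that are not shown here:
# _check_rte_sitemap() decides which sub-sitemaps of rte.ie/sitemap.xml are
# worth fetching, and _rtenuacht_path() decides whether an article URL
# belongs to the Irish-language news section. The versions below are guesses
# based on how they are called; the exact path prefixes are assumptions.
def _sketch_check_rte_sitemap(url):
    """Hypothetical: only descend into news sub-sitemaps."""
    return 'sitemap-news' in url or 'nuacht' in url

def _sketch_rtenuacht_path(url):
    """Hypothetical: keep only Irish-language news articles."""
    return urlpath(url).startswith('/news/nuacht/')
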
def crawl_azattyk_org(crawler, out):
    sitemap = crawler.fetch_sitemap('https://www.azattyk.org/sitemap.xml')
    for url in sorted(sitemap.keys()):
        if not urlpath(url).startswith('/a/'):
            continue
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        pubdate = re.search(r'"dateModified":"([^"]+)"', html)
        if pubdate is not None:
            pubdate = cleantext(pubdate.group(1)).replace(' ', 'T')
        title = extract('<title>', '</title>', html)
        text = extract('content-offset">', '<footer', html)
        if not title or not text:
            continue
        text = text.split('<span class="share')[0]
        text = text.split('<div class="region"')[0]
        text = text.replace('\n', ' ')
        paras = [title] + re.sub(r'<(?:br|p|div)\s*?/?>', '\n',
                                 text).splitlines()
        paras = filter(None, [cleantext(p.strip()) for p in paras])
        paras = [p for p in paras if not p.startswith('http')]
        if not paras:
            continue
        # Filter out English text.
        if ord(paras[0][0]) <= 0xFF or ord(paras[-1][-1]) <= 0xFF:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        for p in paras:
            out.write(p + '\n')

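# Several crawls use urlpath() to get the path component of a URL and
# urlencode() to percent-escape non-ASCII characters before fetching. Minimal
# stand-ins built on the standard library follow; they are sketches of what
# the project's own helpers (not shown here) appear to do.
from urllib.parse import quote, urlsplit

def _sketch_urlpath(url):
    """Hypothetical: return the path component, e.g. '/a/29241873.html'."""
    return urlsplit(url).path

def _sketch_urlencode(url):
    """Hypothetical: percent-escape non-ASCII characters in a URL."""
    return quote(url, safe=":/?&=%#+,;'@()*[]")
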
def crawl_shannews(crawler, out):
    urls = find_wordpress_urls(crawler, 'https://shannews.org/archives/',
                               allow_404=True)
    urls = [
        u for u in urls
        if re.match(r'^https://shannews.org/archives/\d+$', u)
    ]
    for url in sorted(urls):
        doc = crawler.fetch(url)
        assert doc.status == 200, (doc.status, url)
        html = doc.content.decode('utf-8')
        title = re.search(r'<h1 class="entry-title">(.+?)</h1>',
                          html).group(1)
        pubdate = re.search(
            r'<meta itemprop="datePublished" content="([^"]+)">', html)
        pubdate = cleantext(pubdate.group(1)) if pubdate else None
        try:
            text = html.split('<div class="td-post-content">', 1)[1] \
                .split('<div id="fb-root">')[1] \
                .split("<div class='heateorFfcClear'>")[0] \
                .replace('\n', ' ').replace('</p>', '\n')
        except Exception:
            print('No content: %s' % url)
            continue
        paras = [cleantext(p) for p in [title] + text.splitlines()]
        paras = filter(None, paras)
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        out.write('\n'.join(paras) + '\n')

def crawl_wantokniuspepa_com(crawler, out):
    sections = {
        'abc-pasifik-nius', 'bisnis-nius', 'helt-nius', 'komentri',
        'laip-stail', 'meri-nius', 'nius', 'wantok'
    }
    seeds = set()
    for section in sorted(sections):
        section_url = 'http://wantokniuspepa.com/index.php/%s' % section
        seeds.add(section_url)
        section_index = crawler.fetch(section_url)
        assert section_index.status == 200, (section_index.status,
                                             section_url)
        last_page = re.search(r'"End" href=".+?start=(\d+)" class="pagenav"',
                              section_index.content.decode('utf-8'))
        if last_page is not None:
            for page in range(1, int(last_page.group(1)) + 1):
                seeds.add('http://wantokniuspepa.com/index.php/%s?start=%d' %
                          (section, page))
    urls = set()
    for seed in sorted(seeds):
        doc = crawler.fetch(seed)
        assert doc.status == 200, (doc.status, seed)
        content = doc.content.decode('utf-8')
        for u in re.findall(r'(/index\.php/[^"]+?)"', content):
            p = u.split('/')
            if len(p) > 3 and p[1] == 'index.php' and p[2] in sections:
                if re.search(r'/\d{4,}', u) is not None:
                    urls.add('http://wantokniuspepa.com' + u.split('?')[0])
    for url in sorted(urls):
        doc = crawler.fetch(url)
        assert doc.status == 200, (doc.status, url)
        content = doc.content.decode('utf-8')
        title = extract('<title>', '</title>', content)
        pubdate = re.search(
            r'<time datetime="([^T]+?)T([^"]+?)" '
            r'itemprop="datePublished">', content)
        pubdate = cleantext(pubdate.group(1)) if pubdate else None
        body = extract('<div itemprop="articleBody">', '<ul class="pager',
                       content)
        if not body:
            continue
        body = body.split('<div class="clearfix"')[0]
        text = body.replace('\n', ' ')
        text = text.replace(' ,', ',').replace('“ ', '“')
        text = re.sub(r'</(?:div|DIV|p|P|[hH][1-6]|table|TABLE)>', '\n', text)
        text = re.sub(r'<(?:br|BR)\s*/?>', '\n', text)
        paras = [cleantext(p) for p in [title] + text.splitlines()]
        paras = list(filter(None, paras))
        if not paras:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        out.write('\n'.join(paras) + '\n')

def crawl_titus_avestan(crawler, out, out_latin):
    for page in range(1, 249):
        url = ('http://titus.uni-frankfurt.de/texte/etcs/iran/airan/avesta/'
               + 'avest%03d.htm' % page)
        doc = crawler.fetch(url)
        assert doc.status == 200, (doc.status, url)
        html = doc.content.decode('utf-8')
        for chapter_id, chapter in enumerate(
                re.split('<span id=(?:h3|subtitle)>', html)[1:]):
            chapter = chapter.replace('<SUP>\u030A</SUP>', '\u030A')
            chapter = chapter.replace('<SUP>v</SUP>', '\u1D5B')
            chapter = chapter.replace('β', '\uA7B5')  # LATIN SMALL LETTER BETA
            chapter = chapter.replace('δ', 'ẟ')  # LATIN SMALL LETTER DELTA
            title = re.search(r'<a id=subtitle[^>]*>(.+?)</a>', chapter)
            text = [title.group(1) if title else '']
            for paragraph in chapter.split('Paragraph')[1:]:
                cur_paragraph = []
                for verse in paragraph.split('Verse')[1:]:
                    verse = cleantext(verse.split('>', 1)[1])
                    verse = verse.split('This')[0]
                    verse = re.sub(r'(\s*:+\s*)',
                                   lambda m: ' ' + m.group(1).strip() + ' ',
                                   verse)
                    verse = re.sub(r'\.{2,}', '…', verse)
                    for c in '+*^':
                        verse = verse.replace(c, ' ')
                    verse = re.sub(r'[\s\.\d]+\)[\s\.]', ') ', verse)
                    verse = re.sub(r'[\s\.\d]+\]\.*', '] ', verse)
                    verse = re.sub(r'\{[^}]+\}', ' ', verse)
                    verse = re.sub(r'\(~[^)]+\)', ' ', verse)
                    verse = re.sub(r'[\s\.\d]*(:+)[\s\.\d]*',
                                   lambda m: m.group(1) + ' ', verse)
                    words = [w.strip('0123456789.') for w in verse.split()]
                    verse = cleantext(' '.join(words)).lower()
                    verse = verse.replace(': :', '::')
                    cur_paragraph.append(verse)
                p = ' '.join(cur_paragraph)
                p = re.sub(r'[^:]::[^:]', '. ', p)
                p = re.sub(r'[^:]::$', '. ', p) + ' '
                sentences = []
                for s in p.split('. '):
                    if len(s) > 1:
                        s = ' '.join(s.split())
                        sentences.append(s[0].title() + s[1:] + '. ')
                p = '. '.join(sentences).strip()
                p = p.replace('. .', '.')
                text.append(unicodedata.normalize('NFC', p))
            paras = list(filter(None, text))
            out.write('# Location: %s#%d\n' % (url, chapter_id + 1))
            out_latin.write('# Location: %s#%d\n' % (url, chapter_id + 1))
            out_latin.write('\n'.join(paras) + '\n')
            out.write(untransliterate('\n'.join(paras)) + '\n')

def _crawl_kauno_diena_lt(crawler, out):
    urls = {}
    for i in range(1, 6):
        url = 'http://kauno.diena.lt/sitemap/kd/sitemap%d.xml' % i
        urls.update(crawler.fetch_sitemap(url))
    for url in sorted(urls):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        try:
            html = doc.content.decode('utf-8')
        except UnicodeDecodeError:
            continue
        title = extract('<h1 class="title" id="page-title">', '</h1>', html)
        title = cleantext(title if title else '')
        body = extract("<span itemprop='articleBody'>", '</div>', html) or ''
        paras = []
        for p in clean_paragraphs('%s<br/>%s' % (title, body)):
            if 'MicrosoftInternetExplorer4' in p:
                break
            paras.append(p)
        pubdate = re.search(
            r'<span\s+property="dc:date\s+dc:created"\s+content="(20[^"]+)"',
            html)
        pubdate = pubdate.group(1) if pubdate else None
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')

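# clean_paragraphs() is another shared utility (used by the kauno.diena.lt,
# pl.usembassy.gov, RTÉ and coislife.ie crawls) that is defined outside this
# section. Judging from the call sites it splits an HTML fragment into
# cleaned paragraph strings; the sketch below approximates that behaviour and
# is not the actual implementation.
def _sketch_clean_paragraphs(html):
    """Hypothetical: split HTML on block boundaries and clean each piece."""
    text = html.replace('\n', ' ')
    text = re.sub(r'</(?:p|div|h[1-6]|li|table)>', '\n', text)
    text = re.sub(r'<br\s*/?>', '\n', text)
    return [p for p in (cleantext(line) for line in text.splitlines()) if p]
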
def crawl_areena_yle_fi(crawler, out):
    for offset in range(0, 3000, 10):
        url = ('https://areena.yle.fi/api/programs/v1/items.json?'
               'series=1-1931339&type=program&availability=ondemand&'
               'order=episode.hash%3Adesc%2Cpublication.starttime%3Adesc%2C'
               'title.fi%3Aasc&app_id=89868a18&'
               'app_key=54bb4ea4d92854a2a45e98f961f0d7da&'
               'limit=10&offset=' + str(offset))
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        content = json.loads(doc.content)
        data = content.get('data')
        if not data:
            return
        for item in data:
            title = item.get('itemTitle', {}).get('fi')
            description = item.get('description', {}).get('fi', '')
            paras = filter(None, [title] + description.splitlines())
            paras = filter(None, [cleantext(p) for p in paras])
            paras = [
                p for p in paras
                if not (p.startswith('(') or p.startswith('Nuntii Latini'))
            ]
            publications = item.get('publicationEvent', [])
            pubdates = list(
                filter(None, [e.get('startTime') for e in publications]))
            pubdate = min(pubdates) if pubdates else None
            if paras:
                out.write('# Location: %s\n' % item['@id'])
                out.write('# Genre: News\n')
                if pubdate:
                    out.write('# Publication-Date: %s\n' % pubdate)
                out.write('\n'.join(paras) + '\n')

def crawl_azg_am(crawler, out):
    urls = set()
    for d in daterange(date(2001, 1, 9), date.today()):
        datestr = '%04d%02d%02d00' % (d.year, d.month, d.day)
        url = 'http://www.azg.am/AM/%s' % datestr
        doc = crawler.fetch(url)
        assert doc.status == 200, (doc.status, url)
        content = doc.content.decode('utf-8')
        articles = [
            a for a in re.findall(r'20\d{8}', content)
            if not a.endswith('00')
        ]
        for a in articles:
            urls.add('http://www.azg.am/wap/?nl=AM&id=%s&Base_PUB=0' % a)
    print(len(urls))
    for url in sorted(urls):
        pubdate = re.search(r'id=(20\d{6})', url).group(1)
        doc = crawler.fetch(url)
        assert doc.status == 200, (doc.status, url)
        content = doc.content.decode('utf-8')
        text = extract('<hr>', '<hr>', content)
        text = text.replace('\n', ' ')
        text = re.sub(r'</(p|h[1-9]|div)>', '\n', text)
        paras = list(filter(None, [cleantext(p) for p in text.splitlines()]))
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            out.write('# Publication-Date: %s-%s-%s\n' %
                      (pubdate[:4], pubdate[4:6], pubdate[6:8]))
            out.write('\n'.join(paras) + '\n')

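# crawl_azg_am() iterates over daterange(), a helper defined elsewhere. A
# day-by-day generator is the natural reading of the call site; treating the
# end date as exclusive is an assumption about its exact semantics.
from datetime import timedelta

def _sketch_daterange(start, end):
    """Hypothetical: yield every date from start up to (but excluding) end."""
    d = start
    while d < end:
        yield d
        d += timedelta(days=1)
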
def crawl_chg(crawler, out):
    def _chg_content(page):
        return page.split('<div class="container" id="article">')[1].split(
            '<!-- /.right columns -->')[0]
    sitemap = 'https://www.chg.gov.ie/ga/help/sitemap/'
    res = crawler.fetch(sitemap)
    if res.status != 200:
        return
    links = set()
    html = res.content.decode('utf-8')
    body = _chg_content(html)
    for pagelink in re.findall('<a href="([^"]*)">', body):
        if pagelink.startswith('https://www.chg.gov.ie/ga/'):
            links.add(pagelink)
    for link in links:
        pres = crawler.fetch(link)
        if pres.status != 200:
            continue
        phtml = pres.content.decode('utf-8')
        ptext = _chg_content(phtml)
        title = re.search(r'<title>(.+?)</title>', phtml)
        if title:
            title = striptags(title.group(1).split('|')[0]).strip()
        pubdate = pres.headers.get('Last-Modified')
        out.write('# Location: %s\n' % link)
        out.write('# Genre: Government\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        for paragraph in re.findall(r'<p>(.+?)</p>', ptext):
            cleaned = cleantext(paragraph)
            out.write(cleaned + '\n')

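# striptags() is used by the chg.gov.ie, RTÉ, Paipera Tapu and Māori
# Television crawls to drop markup from <title> contents. It is defined
# elsewhere; the one-liner below is a plausible stand-in rather than the
# project's own version.
def _sketch_striptags(html):
    """Hypothetical: remove all HTML tags, keeping the text between them."""
    return re.sub(r'<[^>]*>', '', html)
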
def crawl_quatrociacoe_it(crawler):
    out = crawler.get_output(language='vec-u-sd-itpd')
    urls = set()
    main = crawler.fetch('http://www.quatrociacoe.it/')
    assert main.status == 200, main.status
    main_html = main.content.decode('utf-8')
    for e in re.findall(r'href="/(\d{6})/\d{6}\.php"', main_html):
        ed = crawler.fetch('http://www.quatrociacoe.it/%s/%s.php' % (e, e))
        assert ed.status == 200, ed.status
        ed_html = ed.content.decode('utf-8')
        for path in re.findall(r'href="(/%s/.+?\.php)"' % e, ed_html):
            if path != '/%s/%s.php' % (e, e):
                urls.add('http://www.quatrociacoe.it' + path)
    for url in sorted(urls):
        if url in BLACKLISTED_URLS:
            continue
        doc = crawler.fetch(url)
        assert doc.status == 200, doc.status
        # The charset declaration is plain ASCII, so sniff it before decoding.
        encoding = re.search(r'html;\s*charset=([\-a-zA-Z0-9]+)"',
                             doc.content.decode('ascii', 'ignore'))
        encoding = encoding.group(1).lower() if encoding else 'utf-8'
        assert encoding in ('iso-8859-1', 'utf-8'), (encoding, url)
        content = doc.content.decode(encoding)
        text = extract('<!-- *** INIZIO ARTICOLO ***-->',
                       '<!-- *** FINE ARTICOLO ***-->', content)
        if not text:
            continue
        year, month = re.search(r'/(20\d{2})(\d{2})/', url).groups()
        text = text.replace('\n', ' ').replace('\r', ' ')
        text = re.sub(r'Torna\s+alla pagina principale', ' ', text)
        text = text.replace('[torna sopra]', ' ')
        text = re.sub(r'<!--.+?-->', '', text, flags=re.DOTALL)
        text = re.sub(r' alt="[^"]+"', ' ', text, flags=re.DOTALL)
        text = text.replace('\u0091', '’')  # misuse of U+0091 PRIVATE USE ONE
        text = text.replace('\u0092', '’')  # misuse of U+0092 PRIVATE USE TWO
        text = text.replace('<<', '«').replace('>>', '»')  # invalid HTML
        text = re.sub(r'\.{3,}', '…', text)
        text = re.sub(r'</(?:div|DIV|p|P|[hH][1-6]|table|TABLE)>', '\n', text)
        text = re.sub(r'<(?:br|BR)\s*/?>', '\n', text)
        paras = filter(None, [cleantext(p) for p in text.splitlines()])
        text = re.sub(r'<img.+?\n">', ' ', '\n'.join(paras))
        paras = filter(None, [cleantext(p) for p in text.splitlines()])
        out.write('# Location: %s\n' % url)
        out.write('# Publication-Date: %s-%s-01\n' % (year, month))
        out.write('# Genre: Fiction\n')
        out.write('\n'.join(paras) + '\n')

def crawl_newsbook_mt(crawler, out):
    urls = set()
    for section in ('internazzjonali', 'muzika', 'madwar-il-hajja',
                    'teknologijja', 'vatikan', 'sports', 'kummerc'):
        section_url = 'http://www.newsbook.com.mt/artikli/%s/' % section
        html = crawler.fetch(section_url).content.decode('utf-8')
        links = re.findall(r'/artikli/%s/(\d+)/' % section, html)
        num_toc_pages = max([int(x) for x in links])
        for i in range(1, num_toc_pages + 1):
            toc_url = section_url
            if i > 1:
                toc_url = toc_url + '%d/' % i
            html = crawler.fetch(toc_url).content.decode('utf-8')
            for u in re.findall(r'href="(/artikli/\d{4}/.+?)"', html):
                url = urljoin(toc_url, u)
                if url.find('/test') < 0:
                    urls.add(url)
    for url in sorted(urls):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        title = re.search(r'<meta content="([^"]+?)" name="title"', html)
        if title is not None:
            title = cleantext(title.group(1))
        pubdate = re.search(
            r'<meta content="([^"]+?)" itemprop="datePublished"', html)
        if pubdate is not None:
            pubdate = pubdate.group(1).strip().replace(' ', 'T') + 'Z'
        content = html.split('<p>', 1)[1].split('<div', 1)[0]
        content = content.replace('\n', ' ').replace('</p>', '\n')
        paras = [
            fixquotes(cleantext(p)) for p in [title] + content.splitlines()
        ]
        paras = list(filter(None, paras))
        if not paras:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        for p in paras:
            out.write(p + '\n')

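# crawl_newsbook_mt() passes every paragraph through fixquotes(), which is
# not defined in this section. It presumably converts typewriter quotes into
# typographic ones; the pairing rules below are an assumption, not the
# project's actual logic.
def _sketch_fixquotes(text):
    """Hypothetical: turn straight quotes into curly quotes."""
    text = re.sub(r'"(\S[^"]*?)"', '“\\1”', text)  # "word" -> “word”
    text = re.sub(r"(\w)'(\w)", '\\1’\\2', text)   # don't -> don’t
    return text
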
def crawl_dimma_fo(crawler, out):
    num_pages = int(
        re.search(r'<a href="http://www.dimma.fo/(\d+)" class="to-last"',
                  crawler.fetch('http://www.dimma.fo/').content.decode(
                      'utf-8')).group(1))
    urls = set()
    for i in range(1, num_pages + 1):
        doc = crawler.fetch('http://www.dimma.fo/%d' % i)
        html = doc.content.decode('utf-8')
        for u in re.findall(r'href="(http://www.dimma.fo/[^"]+?)"', html):
            path = urlpath(u)
            if re.match(r'/\d+', path) or '/' in path[1:]:
                continue
            urls.add(u)
    for url in sorted(urls):
        doc = crawler.fetch(urlencode(url))
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        content = html.split('class="content">')[1]
        pubdate = re.search(
            r'<span class="date">\s*'
            r'(\d{4}-\d{2}-\d{2})\s+(\d{2}:\d{2})\s*</span>', content)
        if pubdate is not None:
            pubdate = '%sT%s:00+01:00' % (pubdate.group(1), pubdate.group(2))
        paragraphs = []
        title = re.search(r'<h1>(.+?)</h1>', html, flags=re.DOTALL)
        if title is not None:
            paragraphs.append(cleantext(title.group(1)))
        text = content.split('<p>', 1)[1].split('</div>')[0]
        text = text.replace('\n', ' ').replace('</p>', '\n')
        text = text.replace('<br />', '\n')
        paragraphs.extend([cleantext(p) for p in text.splitlines()])
        paragraphs = list(filter(None, paragraphs))
        if paragraphs:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            for p in paragraphs:
                out.write(p + '\n')

def crawl_pl_usembassy_gov(crawler, out):
    sitemap = crawler.fetch_sitemap(
        'https://pl.usembassy.gov/sitemap_index.xml')
    trans_regex = re.compile(
        r'<h3>Tłumaczenie</h3>'
        r'<div class="translations_sidebar"><ul><li><a href ?="([^"]*)"')
    pubdate_regex = re.compile(
        r'<meta property="article:published_time" content="([^"]*)"')
    links = set()
    for key in sorted(sitemap.keys()):
        if _pl_usembassy_gov_path(key):
            links.add(key)
    for link in sorted(links):
        result = crawler.fetch(link)
        if result.status != 200:
            continue
        html = result.content.decode('utf-8')
        title = extract('<title>', '</title>', html)
        title = title if title else ''
        title = title.split(' | ')[0] if ' | ' in title else title
        pubdate_match = pubdate_regex.search(html)
        pubdate = pubdate_match.group(1) if pubdate_match else None
        trans_match = trans_regex.search(html)
        trans = trans_match.group(1) if trans_match else None
        if pubdate is None:
            pubdate = result.headers.get('Last-Modified')
        if pubdate is None:
            pubdate = sitemap[link]
        exstart = '<div class="entry-content">'
        exstart2 = '<div class="mo-page-content">'
        exend = ('<!-- AddThis Advanced Settings above via filter '
                 'on the_content -->')
        exstart = exstart2 if exstart2 in html else exstart
        content = extract(exstart, exend, html)
        cleanparas = clean_paragraphs(content) if content else None
        # Don't repeat the title if it's the only text content.
        cleantitle = cleantext(title)
        if cleanparas:
            if len(cleanparas) == 1 and cleanparas[0] == cleantitle:
                paras = [cleantitle]
            else:
                paras = [cleantitle] + cleanparas
        else:
            paras = [cleantitle]
        # There are quite a few media pages whose only text is the filename;
        # this, conveniently, is typically also the post's name.
        if len(paras) == 1 and paras[0].lower() in urlpath(link).lower():
            continue
        if paras:
            out.write('# Location: %s\n' % link)
            out.write('# Genre: Diplomatic\n')
            if trans:
                out.write('# Translation: %s\n' % trans)
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')

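# _pl_usembassy_gov_path() above filters the pl.usembassy.gov sitemap down to
# pages worth crawling. The helper lives outside this section; the sketch
# below, which keeps Polish-language content pages and drops obvious
# non-article paths, is only a guess at its intent.
def _sketch_pl_usembassy_gov_path(url):
    """Hypothetical: keep Polish-language content pages only."""
    path = urlpath(url)
    if not path.startswith('/pl/'):
        return False
    # Skip media attachments and paginated archives.
    return not re.search(r'\.(?:jpg|png|pdf)$|/page/\d+/$', path)
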
def crawl_kuensel(crawler, out):
    urls = find_wordpress_urls(crawler, 'http://www.dzkuensel.com/')
    urls = [u for u in urls if '%' in u]
    for url in sorted(urls):
        doc = crawler.fetch(url)
        assert doc.status == 200, (doc.status, url)
        html = doc.content.decode('utf-8')
        title = re.search(r'<h1[^>]*>(.+?)</h1>', html).group(1)
        pubdate = re.search(r'"datePublished":"(.+?)"', html)
        pubdate = cleantext(pubdate.group(1)) if pubdate else None
        text = html.split('<div class="entry">',
                          1)[1].split('<!-- .entry ')[0]
        text = text.replace('\n', ' ').replace('</p>', '\n')
        paras = [cleantext(p) for p in [title] + text.splitlines()]
        paras = list(filter(None, paras))
        if any(p.startswith('Search for') for p in paras):
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        out.write('\n'.join(paras) + '\n')

def _scrape_paiperatapu(crawler, out):
    books = crawler.fetch(
        'http://www.paiperatapu.maori.nz/paipera-tapu-online')
    assert books.status == 200, books.status
    bookshtml = books.content.decode('utf-8')
    bookshtmlinner = bookshtml.split(
        '<div class="bible-book-list">')[1].split(
            '<li class="first bible-search">')[0]
    for bookslink in re.findall(r'<a href="(/bible/[0-9]*/[^"]*)">',
                                bookshtmlinner):
        bookurl = 'http://www.paiperatapu.maori.nz' + bookslink
        book = crawler.fetch(bookurl)
        assert book.status == 200, book.status
        bookhtml = book.content.decode('utf-8')
        bookhtmlinner = bookhtml.split(
            '<ul class="bible-chapter-list">')[1].split(
                '<div class="bible-links">')[0]
        for chapterlink in re.findall(
                r'<a href="(/bible/[0-9]*/[^/]*/[^"]*)">', bookhtmlinner):
            url = 'http://www.paiperatapu.maori.nz' + chapterlink
            chapter = crawler.fetch(url)
            assert chapter.status == 200, chapter.status
            chapterhtml = chapter.content.decode('utf-8')
            if '<dl class="bible-chapter-content">' not in chapterhtml:
                continue
            out.write('# Location: %s\n' % url)
            title = re.search(r'<title>(.+?)</title>', chapterhtml)
            if title:
                title = striptags(title.group(1).split('| Te')[0]).strip()
            # Title is in English.
            if title:
                out.write('# Title: %s\n' % cleantext(title))
            out.write('# Genre: Religion\n')
            chapterhtmlinner = chapterhtml.split(
                '<dl class="bible-chapter-content">')[1].split(
                    '<div class="bible-chapter-seek">')[0]
            for verse in re.finditer(
                    r'<dt><a name="[^"]*"></a>([^<]*)</dt>'
                    r'<dd class="[^"]*">([^<]*)</dd>', chapterhtmlinner):
                out.write('%s %s\n' %
                          (verse.group(1), cleantext(verse.group(2))))

def crawl_naij(crawler, out):
    urls = crawler.fetch_sitemap(
        'https://hausa.naij.com/naij/sitemap/hausa/sitemap.xml').keys()
    urls = sorted([u for u in urls if u.find('hausa') > 0])
    for url in urls:
        doc = crawler.fetch(url).content.decode('utf-8')
        doc = re.sub(r'<script>.+?</script>', '', doc, flags=re.DOTALL)
        pubdate = re.search(
            r'<meta itemprop="datePublished" content="(.+?)"', doc).group(1)
        title = cleantext(
            re.search(r'<h1.*?>(.+?)</h1>', doc, re.DOTALL).group(1))
        article = '<article' + doc.split('<article')[1].split('<p>Source:')[0]
        paragraphs = [title]
        for text in article.split('</p>'):
            text = cleantext(text)
            if text:
                paragraphs.append(text)
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        out.write('# Publication-Date: %s\n' % pubdate)
        for p in paragraphs:
            out.write(p + '\n')

def _rte_cleanall(html):
    section_article_regex = re.compile(
        r'<section[^>]+itemprop="articleBody"[^>]*>')
    search = section_article_regex.search(html)
    out = []
    if search:
        body = extract(search.group(0), '</section>', html)
        for para in clean_paragraphs(body):
            if _rte_writable_paragraph(para):
                out.append(para)
        return '\n'.join(out)
    for paragraph in re.findall(r'<p>(.+?)</p>', html):
        cleaned = cleantext(paragraph)
        if _rte_writable_paragraph(cleaned):
            out.append(cleaned)
    return '\n'.join(out)

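# _rte_writable_paragraph(), used by both crawl_nuachtrte() and
# _rte_cleanall(), decides whether a cleaned paragraph is genuine article
# text or site boilerplate. Its real rules are not shown here; the sketch
# below merely illustrates the kind of filtering the call sites imply, and
# the boilerplate prefixes are assumptions.
def _sketch_rte_writable_paragraph(text):
    """Hypothetical: drop empty strings and obvious RTÉ boilerplate."""
    if not text:
        return False
    boilerplate = ('©', 'RTÉ.ie', 'Follow @')
    return not any(text.startswith(b) for b in boilerplate)
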
def crawl_coislife_ie(crawler, out):
    links = set()
    for num in range(1, 12):
        if num > 1:
            listurl = ('https://www.coislife.ie/product-category/ga/page/%s/'
                       % num)
        else:
            listurl = 'https://www.coislife.ie/product-category/ga/'
        idxres = crawler.fetch(listurl)
        if idxres.status != 200:
            continue
        idxhtml = idxres.content.decode('utf-8')
        index = extract('<div class="products-archive--products">',
                        '<nav class="woocommerce-pagination">', idxhtml)
        for link in re.findall(
                r'<a href="(https://www.coislife.ie/product/[^"]+?)">',
                index):
            links.add(link)
    for url in sorted(links):
        fetchresult = crawler.fetch(url)
        if fetchresult.status != 200:
            continue
        html = fetchresult.content.decode('utf-8')
        title = re.search(r'<title>(.+?)</title>', html)
        title = title.group(1).split('–')[0].strip() if title else ''
        desc = re.search(
            r'<meta property="og:description" content="([^"]+?)"', html)
        desc = cleantext(desc.group(1)) if desc else None
        body = extract(
            '<div class="tab-content">',
            '<div class="entry-content in fade tab-pane" '
            'id="tab-additional_information">', html) or ''
        paras = clean_paragraphs(title + '<br/>' + body)
        pubdate = fetchresult.headers.get('Last-Modified')
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: Commerce\n')
            if desc:
                out.write('# Description: %s\n' % desc)
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            for para in paras:
                if para.find('Léigh sliocht as an leabhar') >= 0:
                    continue
                out.write(para + '\n')

def crawl_jagbani_punjabkesari_in(crawler, out):
    urls = set()
    main = crawler.fetch('http://jagbani.punjabkesari.in/')
    assert main.status == 200, main.status
    menu = extract('<nav id="menu" class="menu">', '</nav>',
                   main.content.decode('utf-8'))
    urls_re = re.compile(
        r'href="(https?://jagbani\.punjabkesari\.in/[^"]+?)"')
    category_urls = urls_re.findall(menu)
    for category_url in sorted(set([x.strip() for x in category_urls])):
        for page in range(1, 1000):
            doc = crawler.fetch(category_url + '/page/%d' % page)
            content = doc.content.decode('utf-8') if doc.status == 200 else ''
            if content.find('class="story"') < 0:
                break
            for u in urls_re.findall(
                    extract('<span class="story">', '<div class="kjpage"',
                            content)):
                urls.add(urlencode(u.strip()))
    for url in sorted(urls):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        try:
            content = doc.content.decode('utf-8')
        except UnicodeDecodeError:
            continue
        title = extract('<title>', '</title>', content)
        text = extract('<article>', '</article>', content)
        if not text:
            continue
        text = re.sub(r'<br[^a-zA-Z][^>]*>', '<br>', text)
        text = text.replace('\n', ' ').replace('<br>', '\n')
        paras = [title] + text.splitlines()
        paras = list(filter(None, [cleantext(p) for p in paras]))
        pubdate = re.search(
            r'<meta property="article:published_time" content="([^"]+?)"',
            content)
        pubdate = pubdate.group(1) if pubdate else None
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')

def _crawl_osagelanguagetools(crawler, out):
    for database in ('578', 'Approved+Words'):
        url = ('http://osagelanguagetools.appspot.com/words/getPhrases/'
               '?filterStatus=&databases=%s&sortCriteria=index' % database)
        html = crawler.fetch_content(url)
        out.write('# Location: %s\n' % url)
        out.write('# Genre: Dictionary\n')
        out.write('# Publication-Date: 2017\n')
        for row in re.findall(r'<tr.+?</tr>', html, flags=re.DOTALL):
            row = row.replace('\n', ' ')
            text = re.search('<td class="unicodeOsageText.+?>(.+?)</td>', row)
            if not text:
                continue
            text = text.group(1)
            text = text.replace('(Myrtle)', ' ').replace('(Mogri)', ' ')
            text = cleantext(text)
            if text.startswith('Teach'):
                continue
            out.write('%s\n' % text)

def crawl_wikisource_trieste_vernacola(crawler):
    out = crawler.get_output(language='vec-u-sd-itts')
    urls = set()
    index = crawler.fetch(
        'https://vec.wikisource.org/wiki/Indice:Trieste_vernacola.djvu')
    assert index.status == 200, index.status
    remarks = extract('<div id="remarks">', 'Colombe</a>',
                      index.content.decode('utf-8'))
    for path in sorted(set(re.findall(r'href="(/wiki/[^"]+)"', remarks))):
        if not path.startswith('/wiki/Trieste_vernacola/'):
            urls.add('https://vec.wikisource.org' + path)
    for url in sorted(urls.difference(BLACKLISTED_URLS)):
        doc = crawler.fetch(url)
        assert doc.status == 200, (doc.status, url)
        content = doc.content.decode('utf-8')
        text = extract('<div id="scatola" class="testo">', '<noscript>',
                       content)
        text = text.split('<dt>Note</dt>')[0].split('<dl>')[0]
        text = text.replace('\n', ' ')
        text = re.sub(r'<sup.+?</sup>', '', text)
        text = text.replace('\xa0', ' ')  # NBSP used for spacing
        text = text.replace("'", "’")
        text = re.sub(r'<!--.+?-->', '', text, flags=re.DOTALL)
        text = re.sub(r' alt="[^"]+"', ' ', text, flags=re.DOTALL)
        text = re.sub(r'<span class="numeroriga".+?</span>', '', text)
        text = re.sub(r'</(?:div|DIV|p|P|[hH][1-6]|table|TABLE)>', '\n', text)
        text = re.sub(r'<(?:br|BR)\s*/?>', '\n', text)
        lines = [l for l in text.splitlines()
                 if l.find('noprint') < 0 and l.find('font-size:smaller') < 0]
        text = '\n'.join([cleantext(l) for l in lines])
        text = re.sub('\n{2,}', '<p>', text).replace('\n', ' | ')
        text = text.replace('<p>', '\n')
        paras = list(filter(None,
                            [' '.join(p.split()) for p in text.splitlines()]))
        if not paras:
            continue
        # The book, published in 1920, is a collection of earlier lyrics.
        pubyear = re.search(r'<span id="ws-year">(\d{4})</span>', content)
        pubyear = int(pubyear.group(1)) if pubyear else 1920
        out.write('# Location: %s\n' % url)
        out.write('# Genre: Lyrics\n')
        out.write('# Publication-Date: %d\n' % pubyear)
        out.write('\n'.join(paras) + '\n')

def crawl_tamurt(crawler, out):
    for url in sorted(find_tamurt_urls(crawler)):
        doc = crawler.fetch(url)
        assert doc.status == 200, (doc.status, url)
        html = doc.content.decode('utf-8')
        title = re.search(r'<title>(.+?)</title>', html).group(1)
        if title.endswith(' - Tamurt'):
            title = title[:-len(' - Tamurt')]
        pubdate = re.search(
            r'<meta property="article:published_time" content="([^"]+)"',
            html)
        pubdate = pubdate.group(1).strip() if pubdate else None
        content = '<div ' + html.split('<div class="entry-content"', 1)[1]
        content = content.split('<!-- .entry-content -->')[0]
        content = re.sub(r'<!--.+?-->', '', content)
        paras = [title] + content.replace('</p>', '\n').splitlines()
        paras = filter(None, [cleantext(p) for p in paras])
        out.write('# Location: %s\n' % url)
        out.write('# Genre: Blog\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        for p in paras:
            out.write(p + '\n')

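# crawl_tamurt() iterates over find_tamurt_urls(), which is defined
# elsewhere. The site is a WordPress blog, so discovery via a sitemap is one
# plausible shape for the helper; the domain and sitemap path below are
# assumptions, not facts taken from the source.
def _sketch_find_tamurt_urls(crawler):
    """Hypothetical: collect article URLs for the Tamurt blog."""
    sitemap = crawler.fetch_sitemap('https://www.tamurt.info/sitemap.xml')
    return {url for url in sitemap.keys() if '/category/' not in url}
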
def crawl_nupepa_org(crawler, out):
    urls = set()
    for i in range(1, 104):
        url = ('http://nupepa.org/gsdl2.5/cgi-bin/nupepa?e=p-0nupepa--'
               '00-0-0--010---4-----text---0-1l--1en-Zz-1---20-about---'
               '0003-1-0000utfZz-8-00&a=d&cl=CL2.' + str(i))
        doc = crawler.fetch(url)
        assert doc.status == 200, url
        content = doc.content.decode('utf-8')
        for u in re.findall(r'href="(/gsdl2.5/cgi-bin/nupepa[^"]+)"',
                            content):
            if u.endswith('gg=text'):
                urls.add('http://nupepa.org' + replace_html_entities(u))
    for url in sorted(urls):
        doc = crawler.fetch(url)
        assert doc.status == 200, url
        content = doc.content.decode('utf-8')
        if content.find('Document contains no data') >= 0:
            continue
        pubdate = re.search(r'tif_([0-9]{4})([01][0-9])([0123][0-9])\.tif"',
                            content)
        pubdate = ('%s-%s-%s' % (pubdate.group(1), pubdate.group(2),
                                 pubdate.group(3)) if pubdate else None)
        paras = []
        while True:
            text = extract(
                "<p class=MsoNormal style='text-autospace:none'>"
                "<span style='font-size:10.0pt'>", '</table>', content)
            if not text:
                break
            text = text.replace('\n', ' ').replace('<br>', '\n')
            text = replace_html_entities(text.replace('\xa0', ' '))
            paras.extend([cleantext(p) for p in text.splitlines()])
            nexturl = re.search(r'<a href="([^"]+)">next page', content)
            if nexturl is None:
                break
            nexturl = ('http://nupepa.org' +
                       replace_html_entities(nexturl.group(1)))
            doc = crawler.fetch(nexturl)
            assert doc.status == 200, (doc.status, nexturl)
            content = doc.content.decode('utf-8')
        text = '\n'.join(filter(None, paras))
        # Strip passages that are entirely in English.
        text = re.sub(
            r'DEATH OF MR\. DOUGLAS.+?has not been heard of since\.', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'IV\. "Their Majesties do further agree.+?by the parties\.', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'1 Oh, come, come away, from labor now reposing,.+?'
            r'Honolulu, Nov\. 25, 1861\. J\. L\. N\.\*', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'Died at sea, August 14.+?after a passage of about a month\.',
            '', text, flags=re.DOTALL)
        text = re.sub(
            r'On the 26th ult\. the Rev\. J.+?best wishes to you all\."', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'The subscriber avails himself.+?agreeable circumstances\.', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'NOTICE\. The publishing of.+for want of paper\.', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'ARRIVALS AT OAHU, SANDWICH ISLANDS,.+Sold here to the '
            r'Government\.', '', text, flags=re.DOTALL)
        text = re.sub(
            r'NOTICE\. NOTICE is hereby given,.+by the subscriber\.', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'Articles made and agreed.+?upon the Sandwich Islands\.', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'MRS\. MARIA M\. DIBBLE\. Died at Lahainaluna.+?'
            r'SHELDON DIBBLE\.', '', text, flags=re.DOTALL)
        text = re.sub(
            r'DEATH OF MRS\. BETSEY C\. LYONS.+?the son of man cometh\.\"',
            '', text, flags=re.DOTALL)
        text = re.sub(
            r'CARD\. The Missionary Company.+?April 20th 1837\.', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'DISTRESS OF THE WHALE SHIP GEORGE.+?who is now master of '
            r'her\.', '', text, flags=re.DOTALL)
        text = re.sub(
            r'KNOW ALL MEN, That according.+?especially those above re-', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'cited, of the said Commissioners.+?and acknowledge the '
            r'Protest', '', text, flags=re.DOTALL)
        text = re.sub(
            r'and withdrawal of our Deputy as our own.+?in the dominions of '
            r'the Queen of', '', text, flags=re.DOTALL)
        text = re.sub(
            r'Taheite that I have received instructions.+?'
            r'Commodore\. \[Official Copy\]', '', text, flags=re.DOTALL)
        text = re.sub(
            r'TO HIS MAJ\. KAMEHAMEHA.+?Naval Force in the E\. Indies\.', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'To the House of Representatives of the United States.+?'
            r'the arts of civilized life\.', '', text, flags=re.DOTALL)
        text = re.sub(
            r'It cannot but be in conformity.+?right to complain\.', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'The Committee on Foreign Affairs, to whom was.+?'
            r'peace and love\.', '', text, flags=re.DOTALL)
        text = re.sub(
            r'WASHINGTON, June 25th, 1843.+?treat upon all occassions, the',
            '', text, flags=re.DOTALL)
        text = re.sub(
            r'native rulers of the Sandwich.+?P\. Upshur, &c\. &c\.', '',
            text, flags=re.DOTALL)
        if text.startswith('TERMS. One copy'):
            # Article entirely in English.
            continue
        paras = list(filter(None, [cleantext(p) for p in text.splitlines()]))
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')

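# crawl_nupepa_org() calls replace_html_entities() on both URLs and page
# text; the helper itself is defined outside this section. On Python 3 the
# standard library covers this, so the sketch is just a thin wrapper;
# treating it as equivalent to html.unescape() is an assumption.
import html as _htmllib

def _sketch_replace_html_entities(text):
    """Hypothetical: decode &amp;, &#257;, &nbsp; and friends to characters."""
    return _htmllib.unescape(text)
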
def _scrape_maoritelevision(crawler, out):
    articlelist = set()
    articlelist.add('http://www.maoritelevision.com/mi/purongo/purongo-hou')
    articlelist.add('http://www.maoritelevision.com/mi/purongo/hakinakina')
    for i in range(1, 101):
        articlelist.add(
            'http://www.maoritelevision.com/mi/purongo/purongo-hou?page=%d'
            % i)
        articlelist.add(
            'http://www.maoritelevision.com/mi/purongo/hakinakina?page=%d'
            % i)
    links = set()
    pubdate_regex = re.compile(r'<time datetime="([0-9T:+\-]{25})"')
    for url in articlelist:
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        content = doc.content.decode('utf-8')
        for articlepiece in content.split('<article')[1:]:
            for artlink in re.findall('<a href="(/mi/purongo/[^"]*)"',
                                      articlepiece):
                if not artlink.startswith('/mi/purongo/purongo-hou'):
                    links.add('http://www.maoritelevision.com%s' % artlink)
    for url in links:
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        if 'a-motu/rereatea-midday-news' in url:
            continue
        html = doc.content.decode('utf-8')
        if 'lang="mi"' not in html:
            continue
        if 'itemprop="articleBody"' not in html:
            continue
        genre = 'Sport' if '/hakinakina/' in url else 'News'
        pubdate_match = pubdate_regex.search(html)
        pubdate = pubdate_match.group(1) if pubdate_match else None
        if pubdate is None:
            pubdate = doc.headers.get('Last-Modified')
        # These news stories are a parallel (or at least comparable) corpus,
        # so keep the link to the English article.
        english = re.search(
            r'<a href="(/news/[^"]*)" class="language-link" lang="en">',
            html)
        if english:
            english = 'http://www.maoritelevision.com%s' % english.group(1)
        tags = set()
        if '<ul class="tags">' in html:
            tagshtml = html.split('<ul class="tags">')[1].split('</ul>')[0]
            for tag in re.findall(r'<a href="(?:[^"]*)">([^<]*)</a>',
                                  tagshtml):
                tags.add(cleantext(tag))
        paras = []
        title = re.search(r'<title>(.+?)</title>', html)
        if title:
            paras.append(
                cleantext(striptags(title.group(1).split('| Māori')[0])))
        articlehtml = html.split('class="field-body"')[1].split('</div>')[0]
        paras.extend(
            [cleantext(p) for p in re.findall(r'<p>(.+?)</p>', articlehtml)])
        # Filter out English paragraphs.
        paras = [p for p in paras if p and p.find(' the ') < 0]
        if not paras:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: %s\n' % genre)
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        if english:
            out.write('# Translation.en: %s\n' % english)
        if tags:
            out.write('# Tags: %s\n' % ', '.join(tags))
        out.write('\n'.join(paras) + '\n')
