def _pl_usembassy_gov_path(url):
    path = urlpath(url)
    if not path.startswith('/pl/') or path == '/pl/':
        return False
    if path.startswith('/pl/category/') or path.startswith('/pl/tag/'):
        return False
    return True
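# `urlpath` is not defined in this section; throughout these crawlers it is
# assumed to return just the path component of a URL. A minimal sketch of that
# assumed behavior, under a hypothetical name so it cannot shadow the real
# helper:

from urllib.parse import urlparse

def _urlpath_sketch(url):
    # e.g. 'https://pl.usembassy.gov/pl/tag/economy/' -> '/pl/tag/economy/'
    return urlparse(url).path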
def crawl_azattyk_org(crawler, out):
    sitemap = crawler.fetch_sitemap('https://www.azattyk.org/sitemap.xml')
    for url in sorted(sitemap.keys()):
        if not urlpath(url).startswith('/a/'):
            continue
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        pubdate = re.search(r'"dateModified":"([^"]+)"', html)
        if pubdate is not None:
            pubdate = cleantext(pubdate.group(1)).replace(' ', 'T')
        title = extract('<title>', '</title>', html)
        text = extract('content-offset">', '<footer', html)
        if not title or not text:
            continue
        # Cut off share widgets and the surrounding page region.
        text = text.split('<span class="share')[0]
        text = text.split('<div class="region"')[0]
        text = text.replace('\n', ' ')
        paras = [title] + re.sub(r'<(?:br|p|div)\s*?/?>', '\n', text).splitlines()
        paras = [cleantext(p.strip()) for p in paras]
        paras = [p for p in paras if p and not p.startswith('http')]
        if not paras:
            continue
        # Filter out English text: Kyrgyz is written in Cyrillic, so a
        # paragraph that starts or ends with a Latin-1 character is not Kyrgyz.
        if ord(paras[0][0]) <= 0xFF or ord(paras[-1][-1]) <= 0xFF:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        for p in paras:
            out.write(p + '\n')
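# The ord() checks above implement a cheap language filter: Cyrillic lies
# entirely above U+00FF, so a paragraph that begins or ends with a Latin-1
# character is taken to be English rather than Kyrgyz. A standalone sketch of
# the same heuristic (hypothetical helper, for illustration only):

def _looks_cyrillic(paras):
    return ord(paras[0][0]) > 0xFF and ord(paras[-1][-1]) > 0xFF

# _looks_cyrillic(['Кыргызстан'])    -> True
# _looks_cyrillic(['Breaking news']) -> False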
def crawl_pl_usembassy_gov(crawler, out):
    sitemap = crawler.fetch_sitemap('https://pl.usembassy.gov/sitemap_index.xml')
    # 'Tłumaczenie' is Polish for 'Translation'; the sidebar links to the
    # English version of the page.
    trans_regex = re.compile(
        r'<h3>Tłumaczenie</h3><div class="translations_sidebar">'
        r'<ul><li><a href ?="([^"]*)"')
    pubdate_regex = re.compile(
        r'<meta property="article:published_time" content="([^"]*)"')
    links = set()
    for key in sorted(sitemap.keys()):
        if _pl_usembassy_gov_path(key):
            links.add(key)
    for link in sorted(links):
        result = crawler.fetch(link)
        if result.status != 200:
            continue
        html = result.content.decode('utf-8')
        title = extract('<title>', '</title>', html) or ''
        title = title.split(' | ')[0]
        pubdate_match = pubdate_regex.search(html)
        pubdate = pubdate_match.group(1) if pubdate_match else None
        trans_match = trans_regex.search(html)
        trans = trans_match.group(1) if trans_match else None
        if pubdate is None:
            pubdate = result.headers.get('Last-Modified')
        if pubdate is None:
            pubdate = sitemap[link]
        exstart = '<div class="entry-content">'
        exstart2 = '<div class="mo-page-content">'
        exend = '<!-- AddThis Advanced Settings above via filter on the_content -->'
        exstart = exstart2 if exstart2 in html else exstart
        content = extract(exstart, exend, html)
        cleanparas = clean_paragraphs(content) if content else None
        # Don't repeat the title if it is the only text content.
        cleantitle = cleantext(title)
        if cleanparas:
            if len(cleanparas) == 1 and cleanparas[0] == cleantitle:
                paras = [cleantitle]
            else:
                paras = [cleantitle] + cleanparas
        else:
            paras = [cleantitle]
        # Quite a few media pages contain no text except the filename, which
        # conveniently is typically also the post's name; skip those.
        if len(paras) == 1 and paras[0].lower() in urlpath(link).lower():
            continue
        if paras:
            out.write('# Location: %s\n' % link)
            out.write('# Genre: Diplomatic\n')
            if trans:
                out.write('# Translation: %s\n' % trans)
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')
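# `clean_paragraphs` and `cleantext` are assumed helpers from elsewhere in the
# crawler: cleantext() presumably strips markup and normalizes whitespace in a
# single string, and clean_paragraphs() splits an HTML fragment into a list of
# cleaned, non-empty paragraphs. A plausible sketch of the latter in terms of
# the former (an assumption, not the crawler's actual definition):

def _clean_paragraphs_sketch(html):
    text = html.replace('\n', ' ')
    # Treat closing block-level tags and <br> as paragraph breaks.
    text = re.sub(r'</(?:p|div|h[1-6])\s*>|<br\s*/?>', '\n', text,
                  flags=re.IGNORECASE)
    return [p for p in (cleantext(line) for line in text.split('\n')) if p]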
def crawl_dimma_fo(crawler, out):
    # The front page links to the last page of the article index; its number
    # tells us how many index pages to walk.
    front = crawler.fetch('http://www.dimma.fo/').content.decode('utf-8')
    num_pages = int(re.search(
        r'<a href="http://www.dimma.fo/(\d+)" class="to-last"', front).group(1))
    urls = set()
    for i in range(1, num_pages + 1):
        doc = crawler.fetch('http://www.dimma.fo/%d' % i)
        html = doc.content.decode('utf-8')
        for u in re.findall(r'href="(http://www.dimma.fo/[^"]+?)"', html):
            path = urlpath(u)
            # Skip index pages (purely numeric paths) and pages with nested
            # paths; articles live directly under the site root.
            if re.match(r'/\d+', path) or '/' in path[1:]:
                continue
            urls.add(u)
    for url in sorted(urls):
        doc = crawler.fetch(urlencode(url))
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        content = html.split('class="content">')[1]
        pubdate = re.search(
            r'<span class="date">\s*'
            r'(\d{4}-\d{2}-\d{2})\s+(\d{2}:\d{2})\s*</span>', content)
        if pubdate is not None:
            pubdate = '%sT%s:00+01:00' % (pubdate.group(1), pubdate.group(2))
        paragraphs = []
        title = re.search(r'<h1>(.+?)</h1>', html, flags=re.DOTALL)
        if title is not None:
            paragraphs.append(cleantext(title.group(1)))
        text = content.split('<p>', 1)[1].split('</div>')[0]
        text = text.replace('\n', ' ').replace('</p>', '\n')
        text = text.replace('<br />', '\n')
        paragraphs.extend([cleantext(p) for p in text.splitlines()])
        paragraphs = [p for p in paragraphs if p]
        if paragraphs:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            for p in paragraphs:
                out.write(p + '\n')
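# `urlencode` is assumed to percent-encode a URL before fetching: dimma.fo
# article slugs contain Faroese letters such as ð and í, which need escaping
# on the wire. A minimal sketch of that assumption:

from urllib.parse import quote

def _urlencode_sketch(url):
    # Keep URL delimiters intact; non-ASCII characters get %-escaped as UTF-8.
    return quote(url, safe=':/?&=#%')

# _urlencode_sketch('http://www.dimma.fo/tíðindi')
#   -> 'http://www.dimma.fo/t%C3%AD%C3%B0indi'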
def _rtenuacht_path(url):
    # Accept RTÉ Nuacht articles ('nuacht' is Irish for 'news') as well as
    # Raidió na Gaeltachta (rnag) news items.
    rtenuacht = urlpath(url).startswith('/news/nuacht/')
    rnag = '/rnag/nuacht' in url or '/rnag/articles' in url
    return rtenuacht or rnag
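# Example behavior (hypothetical URLs, for illustration only):
#   _rtenuacht_path('https://www.rte.ie/news/nuacht/2019/0101/1234-sceal/')    -> True
#   _rtenuacht_path('https://www.rte.ie/rnag/articles/some-story/')            -> True
#   _rtenuacht_path('https://www.rte.ie/news/business/2019/0101/5678-markets/') -> False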