def crawl_kicnews(crawler, out):
    urls = find_wordpress_urls(crawler, 'http://karen.kicnews.org/')
    urls = [u for u in urls if '%' in u]
    for url in sorted(urls):
        doc = crawler.fetch(url)
        assert doc.status == 200, (doc.status, url)
        html = doc.content.decode('utf-8')
        title = re.search(r'<title>(.+?)</title>', html).group(1)
        pubdate = re.search(
            r'<meta itemprop="datePublished" content="([^"]+)">', html)
        pubdate = cleantext(pubdate.group(1)) if pubdate else None
        try:
            text = html.split(
                '<div class="td-post-content td-pb-padding-side">',
                1)[1].split('<div class="essb_links')[0]
            text = text.replace('\n', ' ').replace('</p>', '\n')
        except Exception:
            print('No content: %s' % url)
            continue
        paras = [cleantext(p) for p in [title] + text.splitlines()]
        paras = filter(None, paras)
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        out.write('\n'.join(paras) + '\n')

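# Several crawls in this section (kicnews, kwayedza, shannews, kuensel) rely
# on a find_wordpress_urls() helper that is defined elsewhere in the project.
# The sketch below only illustrates one plausible shape for it — discovery via
# the conventional WordPress /sitemap.xml — and is not the actual
# implementation; the name is prefixed with _sketch_ to make that explicit.
def _sketch_find_wordpress_urls(crawler, site, allow_404=False):
    """Hypothetical: collect article URLs from a WordPress site's sitemap."""
    # allow_404 is accepted only for signature compatibility with the call
    # sites above; a real implementation would tolerate missing archive pages.
    urls = set()
    sitemap = crawler.fetch_sitemap(site.rstrip('/') + '/sitemap.xml')
    for url in sitemap.keys():
        # Skip listing pages; keep only article permalinks.
        if '/category/' in url or '/tag/' in url or '/author/' in url:
            continue
        urls.add(url)
    return urls
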
def crawl_kwayedza(crawler, out):
    urls = find_wordpress_urls(crawler, site='http://www.kwayedza.co.zw/')
    for url in sorted(urls):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        try:
            html = doc.content.decode('utf-8')
        except UnicodeDecodeError:
            continue
        title = re.search(r'<title>(.+?)</title>', html)
        title = title.group(1) if title else None
        if html.find('itemprop="articleBody"') < 0:
            continue
        pubdate = re.search(r'datetime="(.+?)" itemprop="datePublished"',
                            html)
        if pubdate:
            pubdate = cleantext(pubdate.group(1))
        body = html.split('itemprop="articleBody"', 1)[1].split('>', 1)[1]
        body = body.split('<!-- .post-content -->')[0]
        body = body.split('<div class="post-share">')[0]
        body = body.replace('</p>', '\n').replace('</div>', '\n')
        paras = [title] + body.splitlines()
        paras = list(filter(None, [cleantext(p) for p in paras]))
        if not paras:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        out.write('\n'.join(paras) + '\n')

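# cleantext() and extract() are shared utilities used by nearly every crawl
# function in this section but defined outside it. The sketches below show
# the behaviour the call sites appear to assume — extract() returns the text
# between two markers (or None), cleantext() strips tags, decodes entities
# and collapses whitespace. They are illustrative approximations, not the
# project's own utilities; `re` is assumed to be imported at module level,
# as in the functions above.
import html as _html

def _sketch_extract(start, end, text):
    """Hypothetical: return the substring between start and end, else None."""
    startpos = text.find(start)
    if startpos < 0:
        return None
    startpos += len(start)
    endpos = text.find(end, startpos)
    if endpos < 0:
        return None
    return text[startpos:endpos]

def _sketch_cleantext(text):
    """Hypothetical: strip markup and collapse whitespace."""
    text = re.sub(r'<[^>]*>', ' ', text)  # drop any remaining tags
    text = _html.unescape(text)           # &amp; -> &, &#x2019; -> ’, ...
    return ' '.join(text.split())         # collapse runs of whitespace
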
def crawl_larenadomila_it(crawler):
    out = crawler.get_output(language='vec-u-sd-itvr')
    urls = find_urls_in_larenadomila_it(
        crawler, 'https://www.larenadomila.it/sito/index.php')
    for url in sorted(urls.difference(BLACKLISTED_URLS)):
        if url.find('&view=article&') < 0:
            continue
        doc = crawler.fetch(url)
        assert doc.status == 200, (doc.status, url)
        content = doc.content.decode('utf-8')
        title = cleantext(extract('<title>', '</title>', content))
        sections = [title] + [c.strip() for c in content.splitlines()]
        sections = [c for c in sections
                    if c.startswith('<div class="item_fulltext">')
                    or c.startswith('<p><span class="grassetto">')]
        sections = [c.replace(' <br />- ', ' ') for c in sections]
        text = '<br/>'.join(sections)
        text = text.replace('\xa0', ' ')  # NBSP used for spacing/formatting
        text = re.sub(r'</(?:div|DIV|p|P|[hH][1-6]|table)>', '\n', text)
        text = re.sub(r'<br\s*/?>', '\n', text)
        text = re.sub(r'\.{3,}', '… ', text)
        text = re.sub(r'\n(-)[^\s]', '- ', text)
        paras = list(filter(None, [cleantext(p) for p in text.split('\n')]))
        if not paras:
            continue
        out.write('# Location: %s\n' % url)
        out.write('\n'.join(paras) + '\n')

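# crawl_larenadomila_it() above depends on find_urls_in_larenadomila_it(),
# which is not part of this section. The site is a Joomla installation, so a
# breadth-first walk over internal index.php links is one plausible shape for
# the helper; this is a hedged sketch, not the real discovery logic.
def _sketch_find_urls_in_larenadomila_it(crawler, start_url):
    """Hypothetical: collect internal article/category links from the site."""
    site = 'https://www.larenadomila.it'
    seen, pending = set(), {start_url}
    while pending:
        url = pending.pop()
        seen.add(url)
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        content = doc.content.decode('utf-8')
        for path in re.findall(r'href="(/sito/index\.php[^"]*)"', content):
            link = site + path.replace('&amp;', '&')
            if link not in seen:
                pending.add(link)
    return seen
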
def crawl_irishtimes(crawler, out):
    start = 'https://www.irishtimes.com/culture/treibh'
    pubdatere1 = re.compile(
        r'<meta itemprop="datePublished" content="([^"]*)"/>')
    pubdatere2 = re.compile(r'"datePublished": "([^"]*)"')
    links = set()
    for contents in _irishtimes_section_list(crawler, out, start):
        init = crawler.fetch(contents)
        if init.status != 200:
            continue
        shtml = init.content.decode('utf-8')
        for doclink in re.findall('<p><a href="/culture/treibh/([^"]*)"',
                                  shtml):
            links.add('%s/%s' % (start, doclink))
    for url in links:
        res = crawler.fetch(url)
        if res.status != 200:
            continue
        html = res.content.decode('utf-8')
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        title = re.search(r'<title>(.+?)</title>', html)
        pubdate_match = pubdatere1.search(html)
        pubdate_match = (pubdate_match if pubdate_match
                         else pubdatere2.search(html))
        pubdate = pubdate_match.group(1) if pubdate_match else None
        if pubdate is None:
            pubdate = res.headers.get('Last-Modified')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        if title:
            out.write(cleantext(title.group(1)) + '\n')
        for paragraph in re.findall(
                r'<p class="no_name">(.+?)</p>',
                html.split('<div class="article_bodycopy">')[1]):
            cleaned = cleantext(paragraph)
            out.write(cleaned + '\n')

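# crawl_irishtimes() iterates over _irishtimes_section_list(), a helper that
# is defined elsewhere and presumably yields the paginated listing URLs of
# the /culture/treibh section. The generator below is only a guess at that
# behaviour; the page range and the ?page= query parameter are assumptions.
def _sketch_irishtimes_section_list(crawler, out, start):
    """Hypothetical: yield the section's table-of-contents page URLs."""
    yield start
    for page in range(2, 50):  # assumed upper bound on listing pages
        yield '%s?page=%d' % (start, page)
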
def crawl_loksatta_com(crawler, out):
    sitemap = crawler.fetch_sitemap('http://www.loksatta.com/sitemap.xml')
    for url in sorted(sitemap):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        try:
            html = doc.content.decode('utf-8')
        except UnicodeDecodeError:
            continue
        pubdate = re.search(
            r'<meta itemprop="datePublished" content="(.+?)"', html)
        pubdate = cleantext(pubdate.group(1)) if pubdate else None
        headline = extract('<h1 itemprop="headline" id="headline">', '</h1>',
                           html)
        synopsis = extract('<h2 itemprop="description" class="synopsis">',
                           '</h2>', html)
        text = extract('itemprop="articleBody">', '<div', html)
        if not text:
            continue
        text = text.replace('\n', ' ')
        text = re.sub(r'</?(?:br|BR|p|P)\s*?/?>', '\n', text)
        paras = [headline, synopsis] + text.splitlines()
        paras = list(filter(None, [cleantext(p) for p in paras]))
        if paras:
            out.write('# Location: %s\n# Genre: News\n' % url)
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')

def crawl_nuachtrte(crawler, out):
    sitemap = crawler.fetch_sitemap(
        'http://www.rte.ie/sitemap.xml',
        subsitemap_filter=lambda x: _check_rte_sitemap(x))
    pubdate_regex = re.compile(
        r'name="DC.date" (?:scheme="DCTERMS.URI" )?'
        r'content="([0-9T:+\-]{19,25})"')
    for url in sorted(sitemap.keys()):
        if not _rtenuacht_path(url):
            continue
        fetchresult = crawler.fetch(url)
        if fetchresult.status != 200:
            continue
        html = fetchresult.content.decode('utf-8')
        pubdate_match = pubdate_regex.search(html)
        pubdate = pubdate_match.group(1) if pubdate_match else None
        if pubdate is None:
            pubdate = fetchresult.headers.get('Last-Modified')
        if pubdate is None:
            pubdate = sitemap[url]
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        title = re.search(r'<title>(.+?)</title>', html)
        if title:
            title = striptags(title.group(1).split('- RTÉ')[0]).strip()
        if title:
            out.write(cleantext(title) + '\n')
        for paragraph in re.findall(r'<p>(.+?)</p>', html):
            cleaned = cleantext(paragraph)
            if _rte_writable_paragraph(cleaned):
                out.write(cleaned + '\n')

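# crawl_nuachtrte() relies on two small predicates that are not shown here:
# _check_rte_sitemap() decides which sub-sitemaps of rte.ie/sitemap.xml are
# worth fetching, and _rtenuacht_path() decides whether an article URL
# belongs to the Irish-language news section. The versions below are guesses
# based on how they are called; the exact path prefixes are assumptions.
def _sketch_check_rte_sitemap(url):
    """Hypothetical: only descend into news sub-sitemaps."""
    return 'sitemap-news' in url or 'nuacht' in url

def _sketch_rtenuacht_path(url):
    """Hypothetical: keep only Irish-language news articles."""
    return urlpath(url).startswith('/news/nuacht/')
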
def crawl_azattyk_org(crawler, out):
    sitemap = crawler.fetch_sitemap('https://www.azattyk.org/sitemap.xml')
    for url in sorted(sitemap.keys()):
        if not urlpath(url).startswith('/a/'):
            continue
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        pubdate = re.search(r'"dateModified":"([^"]+)"', html)
        if pubdate is not None:
            pubdate = cleantext(pubdate.group(1)).replace(' ', 'T')
        title = extract('<title>', '</title>', html)
        text = extract('content-offset">', '<footer', html)
        if not title or not text:
            continue
        text = text.split('<span class="share')[0]
        text = text.split('<div class="region"')[0]
        text = text.replace('\n', ' ')
        paras = [title] + re.sub(r'<(?:br|p|div)\s*?/?>', '\n',
                                 text).splitlines()
        paras = filter(None, [cleantext(p.strip()) for p in paras])
        paras = [p for p in paras if not p.startswith('http')]
        if not paras:
            continue
        # Filter out English text.
        if ord(paras[0][0]) <= 0xFF or ord(paras[-1][-1]) <= 0xFF:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        for p in paras:
            out.write(p + '\n')

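# Several crawls use urlpath() to get the path component of a URL and
# urlencode() to percent-escape non-ASCII characters before fetching. Minimal
# stand-ins built on the standard library follow; they are sketches of what
# the project's own helpers (not shown here) appear to do.
from urllib.parse import quote, urlsplit

def _sketch_urlpath(url):
    """Hypothetical: return the path component, e.g. '/a/29241873.html'."""
    return urlsplit(url).path

def _sketch_urlencode(url):
    """Hypothetical: percent-escape non-ASCII characters in a URL."""
    return quote(url, safe=":/?&=%#+,;'@()*[]")
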
def crawl_shannews(crawler, out):
    urls = find_wordpress_urls(crawler, 'https://shannews.org/archives/',
                               allow_404=True)
    urls = [
        u for u in urls
        if re.match(r'^https://shannews.org/archives/\d+$', u)
    ]
    for url in sorted(urls):
        doc = crawler.fetch(url)
        assert doc.status == 200, (doc.status, url)
        html = doc.content.decode('utf-8')
        title = re.search(r'<h1 class="entry-title">(.+?)</h1>',
                          html).group(1)
        pubdate = re.search(
            r'<meta itemprop="datePublished" content="([^"]+)">', html)
        pubdate = cleantext(pubdate.group(1)) if pubdate else None
        try:
            text = html.split('<div class="td-post-content">', 1)[1] \
                .split('<div id="fb-root">')[1] \
                .split("<div class='heateorFfcClear'>")[0] \
                .replace('\n', ' ').replace('</p>', '\n')
        except Exception:
            print('No content: %s' % url)
            continue
        paras = [cleantext(p) for p in [title] + text.splitlines()]
        paras = filter(None, paras)
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        out.write('\n'.join(paras) + '\n')

def crawl_wantokniuspepa_com(crawler, out):
    sections = {
        'abc-pasifik-nius', 'bisnis-nius', 'helt-nius', 'komentri',
        'laip-stail', 'meri-nius', 'nius', 'wantok'
    }
    seeds = set()
    for section in sorted(sections):
        section_url = 'http://wantokniuspepa.com/index.php/%s' % section
        seeds.add(section_url)
        section_index = crawler.fetch(section_url)
        assert section_index.status == 200, (section_index.status,
                                             section_url)
        last_page = re.search(r'"End" href=".+?start=(\d+)" class="pagenav"',
                              section_index.content.decode('utf-8'))
        if last_page is not None:
            for page in range(1, int(last_page.group(1)) + 1):
                seeds.add('http://wantokniuspepa.com/index.php/%s?start=%d' %
                          (section, page))
    urls = set()
    for seed in sorted(seeds):
        doc = crawler.fetch(seed)
        assert doc.status == 200, (doc.status, seed)
        content = doc.content.decode('utf-8')
        for u in re.findall(r'(/index\.php/[^"]+?)"', content):
            p = u.split('/')
            if len(p) > 3 and p[1] == 'index.php' and p[2] in sections:
                if re.search(r'/\d{4,}', u) is not None:
                    urls.add('http://wantokniuspepa.com' + u.split('?')[0])
    for url in sorted(urls):
        doc = crawler.fetch(url)
        assert doc.status == 200, (doc.status, url)
        content = doc.content.decode('utf-8')
        title = extract('<title>', '</title>', content)
        pubdate = re.search(
            r'<time datetime="([^T]+?)T([^"]+?)" '
            r'itemprop="datePublished">', content)
        pubdate = cleantext(pubdate.group(1)) if pubdate else None
        body = extract('<div itemprop="articleBody">', '<ul class="pager',
                       content)
        if not body:
            continue
        body = body.split('<div class="clearfix"')[0]
        text = body.replace('\n', ' ')
        text = text.replace(' ,', ',').replace('“ ', '“')
        text = re.sub(r'</(?:div|DIV|p|P|[hH][1-6]|table|TABLE)>', '\n', text)
        text = re.sub(r'<(?:br|BR)\s*/?>', '\n', text)
        paras = [cleantext(p) for p in [title] + text.splitlines()]
        paras = list(filter(None, paras))
        if not paras:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        out.write('\n'.join(paras) + '\n')

def crawl_titus_avestan(crawler, out, out_latin):
    for page in range(1, 249):
        url = ('http://titus.uni-frankfurt.de/texte/etcs/iran/airan/avesta/'
               + 'avest%03d.htm' % page)
        doc = crawler.fetch(url)
        assert doc.status == 200, (doc.status, url)
        html = doc.content.decode('utf-8')
        for chapter_id, chapter in enumerate(
                re.split('<span id=(?:h3|subtitle)>', html)[1:]):
            chapter = chapter.replace('<SUP>\u030A</SUP>', '\u030A')
            chapter = chapter.replace('<SUP>v</SUP>', '\u1D5B')
            chapter = chapter.replace('β', '\uA7B5')  # LATIN SMALL LETTER BETA
            chapter = chapter.replace('δ', 'ẟ')  # LATIN SMALL LETTER DELTA
            title = re.search(r'<a id=subtitle[^>]*>(.+?)</a>', chapter)
            text = [title.group(1) if title else '']
            for paragraph in chapter.split('Paragraph')[1:]:
                cur_paragraph = []
                for verse in paragraph.split('Verse')[1:]:
                    verse = cleantext(verse.split('>', 1)[1])
                    verse = verse.split('This')[0]
                    verse = re.sub(r'(\s*:+\s*)',
                                   lambda m: ' ' + m.group(1).strip() + ' ',
                                   verse)
                    verse = re.sub(r'\.{2,}', '…', verse)
                    for c in '+*^':
                        verse = verse.replace(c, ' ')
                    verse = re.sub(r'[\s\.\d]+\)[\s\.]', ') ', verse)
                    verse = re.sub(r'[\s\.\d]+\]\.*', '] ', verse)
                    verse = re.sub(r'\{[^}]+\}', ' ', verse)
                    verse = re.sub(r'\(~[^)]+\)', ' ', verse)
                    verse = re.sub(r'[\s\.\d]*(:+)[\s\.\d]*',
                                   lambda m: m.group(1) + ' ', verse)
                    words = [w.strip('0123456789.') for w in verse.split()]
                    verse = cleantext(' '.join(words)).lower()
                    verse = verse.replace(': :', '::')
                    cur_paragraph.append(verse)
                p = ' '.join(cur_paragraph)
                p = re.sub(r'[^:]::[^:]', '. ', p)
                p = re.sub(r'[^:]::$', '. ', p) + ' '
                sentences = []
                for s in p.split('. '):
                    if len(s) > 1:
                        s = ' '.join(s.split())
                        sentences.append(s[0].title() + s[1:] + '. ')
                p = '. '.join(sentences).strip()
                p = p.replace('. .', '.')
                text.append(unicodedata.normalize('NFC', p))
            paras = list(filter(None, text))
            out.write('# Location: %s#%d\n' % (url, chapter_id + 1))
            out_latin.write('# Location: %s#%d\n' % (url, chapter_id + 1))
            out_latin.write('\n'.join(paras) + '\n')
            out.write(untransliterate('\n'.join(paras)) + '\n')

def _crawl_kauno_diena_lt(crawler, out):
    urls = {}
    for i in range(1, 6):
        url = 'http://kauno.diena.lt/sitemap/kd/sitemap%d.xml' % i
        urls.update(crawler.fetch_sitemap(url))
    for url in sorted(urls):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        try:
            html = doc.content.decode('utf-8')
        except UnicodeDecodeError:
            continue
        title = extract('<h1 class="title" id="page-title">', '</h1>', html)
        title = cleantext(title if title else '')
        body = extract("<span itemprop='articleBody'>", '</div>', html) or ''
        paras = []
        for p in clean_paragraphs('%s<br/>%s' % (title, body)):
            if 'MicrosoftInternetExplorer4' in p:
                break
            paras.append(p)
        pubdate = re.search(
            r'<span\s+property="dc:date\s+dc:created"\s+content="(20[^"]+)"',
            html)
        pubdate = pubdate.group(1) if pubdate else None
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')

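# clean_paragraphs() is another shared utility (used by the kauno.diena.lt,
# pl.usembassy.gov, RTÉ and coislife.ie crawls) that is defined outside this
# section. Judging from the call sites it splits an HTML fragment into
# cleaned paragraph strings; the sketch below approximates that behaviour and
# is not the actual implementation.
def _sketch_clean_paragraphs(html):
    """Hypothetical: split HTML on block boundaries and clean each piece."""
    text = html.replace('\n', ' ')
    text = re.sub(r'</(?:p|div|h[1-6]|li|table)>', '\n', text)
    text = re.sub(r'<br\s*/?>', '\n', text)
    return [p for p in (cleantext(line) for line in text.splitlines()) if p]
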
def crawl_areena_yle_fi(crawler, out):
    for offset in range(0, 3000, 10):
        url = ('https://areena.yle.fi/api/programs/v1/items.json?'
               'series=1-1931339&type=program&availability=ondemand&'
               'order=episode.hash%3Adesc%2Cpublication.starttime%3Adesc%2C'
               'title.fi%3Aasc&app_id=89868a18&'
               'app_key=54bb4ea4d92854a2a45e98f961f0d7da&'
               'limit=10&offset=' + str(offset))
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        content = json.loads(doc.content)
        data = content.get('data')
        if not data:
            return
        for item in data:
            title = item.get('itemTitle', {}).get('fi')
            description = item.get('description', {}).get('fi', '')
            paras = filter(None, [title] + description.splitlines())
            paras = filter(None, [cleantext(p) for p in paras])
            paras = [
                p for p in paras
                if not (p.startswith('(') or p.startswith('Nuntii Latini'))
            ]
            publications = item.get('publicationEvent', [])
            pubdates = list(
                filter(None, [e.get('startTime') for e in publications]))
            pubdate = min(pubdates) if pubdates else None
            if paras:
                out.write('# Location: %s\n' % item['@id'])
                out.write('# Genre: News\n')
                if pubdate:
                    out.write('# Publication-Date: %s\n' % pubdate)
                out.write('\n'.join(paras) + '\n')

def crawl_azg_am(crawler, out):
    urls = set()
    for d in daterange(date(2001, 1, 9), date.today()):
        datestr = '%04d%02d%02d00' % (d.year, d.month, d.day)
        url = 'http://www.azg.am/AM/%s' % datestr
        doc = crawler.fetch(url)
        assert doc.status == 200, (doc.status, url)
        content = doc.content.decode('utf-8')
        articles = [
            a for a in re.findall(r'20\d{8}', content)
            if not a.endswith('00')
        ]
        for a in articles:
            urls.add('http://www.azg.am/wap/?nl=AM&id=%s&Base_PUB=0' % a)
    print(len(urls))
    for url in sorted(urls):
        pubdate = re.search(r'id=(20\d{6})', url).group(1)
        doc = crawler.fetch(url)
        assert doc.status == 200, (doc.status, url)
        content = doc.content.decode('utf-8')
        text = extract('<hr>', '<hr>', content)
        text = text.replace('\n', ' ')
        text = re.sub(r'</(p|h[1-9]|div)>', '\n', text)
        paras = list(filter(None, [cleantext(p) for p in text.splitlines()]))
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            out.write('# Publication-Date: %s-%s-%s\n' %
                      (pubdate[:4], pubdate[4:6], pubdate[6:8]))
            out.write('\n'.join(paras) + '\n')

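# crawl_azg_am() iterates over daterange(), a helper defined elsewhere. A
# day-by-day generator is the natural reading of the call site; treating the
# end date as exclusive is an assumption about its exact semantics.
from datetime import timedelta

def _sketch_daterange(start, end):
    """Hypothetical: yield every date from start up to (but excluding) end."""
    d = start
    while d < end:
        yield d
        d += timedelta(days=1)
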
def crawl_chg(crawler, out):
    def _chg_content(page):
        return page.split('<div class="container" id="article">')[1].split(
            '<!-- /.right columns -->')[0]
    sitemap = 'https://www.chg.gov.ie/ga/help/sitemap/'
    res = crawler.fetch(sitemap)
    if res.status != 200:
        return
    links = set()
    html = res.content.decode('utf-8')
    body = _chg_content(html)
    for pagelink in re.findall('<a href="([^"]*)">', body):
        if pagelink.startswith('https://www.chg.gov.ie/ga/'):
            links.add(pagelink)
    for link in links:
        pres = crawler.fetch(link)
        if pres.status != 200:
            continue
        phtml = pres.content.decode('utf-8')
        ptext = _chg_content(phtml)
        title = re.search(r'<title>(.+?)</title>', phtml)
        if title:
            title = striptags(title.group(1).split('|')[0]).strip()
        pubdate = pres.headers.get('Last-Modified')
        out.write('# Location: %s\n' % link)
        out.write('# Genre: Government\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        for paragraph in re.findall(r'<p>(.+?)</p>', ptext):
            cleaned = cleantext(paragraph)
            out.write(cleaned + '\n')

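# striptags() is used by the chg.gov.ie, RTÉ, Paipera Tapu and Māori
# Television crawls to drop markup from <title> contents. It is defined
# elsewhere; the one-liner below is a plausible stand-in rather than the
# project's own version.
def _sketch_striptags(html):
    """Hypothetical: remove all HTML tags, keeping the text between them."""
    return re.sub(r'<[^>]*>', '', html)
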
def crawl_quatrociacoe_it(crawler):
    out = crawler.get_output(language='vec-u-sd-itpd')
    urls = set()
    main = crawler.fetch('http://www.quatrociacoe.it/')
    assert main.status == 200, main.status
    main_html = main.content.decode('utf-8')
    for e in re.findall(r'href="/(\d{6})/\d{6}\.php"', main_html):
        ed = crawler.fetch('http://www.quatrociacoe.it/%s/%s.php' % (e, e))
        assert ed.status == 200, ed.status
        ed_html = ed.content.decode('utf-8')
        for path in re.findall(r'href="(/%s/.+?\.php)"' % e, ed_html):
            if path != '/%s/%s.php' % (e, e):
                urls.add('http://www.quatrociacoe.it' + path)
    for url in sorted(urls):
        if url in BLACKLISTED_URLS:
            continue
        doc = crawler.fetch(url)
        assert doc.status == 200, doc.status
        # The charset declaration is plain ASCII, so sniff it before decoding.
        encoding = re.search(r'html;\s*charset=([\-a-zA-Z0-9]+)"',
                             doc.content.decode('ascii', 'ignore'))
        encoding = encoding.group(1).lower() if encoding else 'utf-8'
        assert encoding in ('iso-8859-1', 'utf-8'), (encoding, url)
        content = doc.content.decode(encoding)
        text = extract('<!-- *** INIZIO ARTICOLO ***-->',
                       '<!-- *** FINE ARTICOLO ***-->', content)
        if not text:
            continue
        year, month = re.search(r'/(20\d{2})(\d{2})/', url).groups()
        text = text.replace('\n', ' ').replace('\r', ' ')
        text = re.sub(r'Torna\s+alla pagina principale', ' ', text)
        text = text.replace('[torna sopra]', ' ')
        text = re.sub(r'<!--.+?-->', '', text, flags=re.DOTALL)
        text = re.sub(r' alt="[^"]+"', ' ', text, flags=re.DOTALL)
        text = text.replace('\u0091', '’')  # misuse of U+0091 PRIVATE USE ONE
        text = text.replace('\u0092', '’')  # misuse of U+0092 PRIVATE USE TWO
        text = text.replace('<<', '«').replace('>>', '»')  # invalid HTML
        text = re.sub(r'\.{3,}', '…', text)
        text = re.sub(r'</(?:div|DIV|p|P|[hH][1-6]|table|TABLE)>', '\n', text)
        text = re.sub(r'<(?:br|BR)\s*/?>', '\n', text)
        paras = filter(None, [cleantext(p) for p in text.splitlines()])
        text = re.sub(r'<img.+?\n">', ' ', '\n'.join(paras))
        paras = filter(None, [cleantext(p) for p in text.splitlines()])
        out.write('# Location: %s\n' % url)
        out.write('# Publication-Date: %s-%s-01\n' % (year, month))
        out.write('# Genre: Fiction\n')
        out.write('\n'.join(paras) + '\n')

def crawl_newsbook_mt(crawler, out):
    urls = set()
    for section in ('internazzjonali', 'muzika', 'madwar-il-hajja',
                    'teknologijja', 'vatikan', 'sports', 'kummerc'):
        section_url = 'http://www.newsbook.com.mt/artikli/%s/' % section
        html = crawler.fetch(section_url).content.decode('utf-8')
        links = re.findall(r'/artikli/%s/(\d+)/' % section, html)
        num_toc_pages = max([int(x) for x in links])
        for i in range(1, num_toc_pages + 1):
            toc_url = section_url
            if i > 1:
                toc_url = toc_url + '%d/' % i
            html = crawler.fetch(toc_url).content.decode('utf-8')
            for u in re.findall(r'href="(/artikli/\d{4}/.+?)"', html):
                url = urljoin(toc_url, u)
                if url.find('/test') < 0:
                    urls.add(url)
    for url in sorted(urls):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        title = re.search(r'<meta content="([^"]+?)" name="title"', html)
        if title is not None:
            title = cleantext(title.group(1))
        pubdate = re.search(
            r'<meta content="([^"]+?)" itemprop="datePublished"', html)
        if pubdate is not None:
            pubdate = pubdate.group(1).strip().replace(' ', 'T') + 'Z'
        content = html.split('<p>', 1)[1].split('<div', 1)[0]
        content = content.replace('\n', ' ').replace('</p>', '\n')
        paras = [
            fixquotes(cleantext(p)) for p in [title] + content.splitlines()
        ]
        paras = list(filter(None, paras))
        if not paras:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        for p in paras:
            out.write(p + '\n')

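# crawl_newsbook_mt() passes every paragraph through fixquotes(), which is
# not defined in this section. It presumably converts typewriter quotes into
# typographic ones; the pairing rules below are an assumption, not the
# project's actual logic.
def _sketch_fixquotes(text):
    """Hypothetical: turn straight quotes into curly quotes."""
    text = re.sub(r'"(\S[^"]*?)"', '“\\1”', text)  # "word" -> “word”
    text = re.sub(r"(\w)'(\w)", '\\1’\\2', text)   # don't -> don’t
    return text
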
def crawl_dimma_fo(crawler, out):
    num_pages = int(
        re.search(r'<a href="http://www.dimma.fo/(\d+)" class="to-last"',
                  crawler.fetch('http://www.dimma.fo/').content.decode(
                      'utf-8')).group(1))
    urls = set()
    for i in range(1, num_pages + 1):
        doc = crawler.fetch('http://www.dimma.fo/%d' % i)
        html = doc.content.decode('utf-8')
        for u in re.findall(r'href="(http://www.dimma.fo/[^"]+?)"', html):
            path = urlpath(u)
            if re.match(r'/\d+', path) or '/' in path[1:]:
                continue
            urls.add(u)
    for url in sorted(urls):
        doc = crawler.fetch(urlencode(url))
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        content = html.split('class="content">')[1]
        pubdate = re.search(
            r'<span class="date">\s*'
            r'(\d{4}-\d{2}-\d{2})\s+(\d{2}:\d{2})\s*</span>', content)
        if pubdate is not None:
            pubdate = '%sT%s:00+01:00' % (pubdate.group(1), pubdate.group(2))
        paragraphs = []
        title = re.search(r'<h1>(.+?)</h1>', html, flags=re.DOTALL)
        if title is not None:
            paragraphs.append(cleantext(title.group(1)))
        text = content.split('<p>', 1)[1].split('</div>')[0]
        text = text.replace('\n', ' ').replace('</p>', '\n')
        text = text.replace('<br />', '\n')
        paragraphs.extend([cleantext(p) for p in text.splitlines()])
        paragraphs = list(filter(None, paragraphs))
        if paragraphs:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            for p in paragraphs:
                out.write(p + '\n')

def crawl_pl_usembassy_gov(crawler, out):
    sitemap = crawler.fetch_sitemap(
        'https://pl.usembassy.gov/sitemap_index.xml')
    trans_regex = re.compile(
        r'<h3>Tłumaczenie</h3>'
        r'<div class="translations_sidebar"><ul><li><a href ?="([^"]*)"')
    pubdate_regex = re.compile(
        r'<meta property="article:published_time" content="([^"]*)"')
    links = set()
    for key in sorted(sitemap.keys()):
        if _pl_usembassy_gov_path(key):
            links.add(key)
    for link in sorted(links):
        result = crawler.fetch(link)
        if result.status != 200:
            continue
        html = result.content.decode('utf-8')
        title = extract('<title>', '</title>', html)
        title = title if title else ''
        title = title.split(' | ')[0] if ' | ' in title else title
        pubdate_match = pubdate_regex.search(html)
        pubdate = pubdate_match.group(1) if pubdate_match else None
        trans_match = trans_regex.search(html)
        trans = trans_match.group(1) if trans_match else None
        if pubdate is None:
            pubdate = result.headers.get('Last-Modified')
        if pubdate is None:
            pubdate = sitemap[link]
        exstart = '<div class="entry-content">'
        exstart2 = '<div class="mo-page-content">'
        exend = ('<!-- AddThis Advanced Settings above via filter '
                 'on the_content -->')
        exstart = exstart2 if exstart2 in html else exstart
        content = extract(exstart, exend, html)
        cleanparas = clean_paragraphs(content) if content else None
        # Don't repeat the title if it's the only text content.
        cleantitle = cleantext(title)
        if cleanparas:
            if len(cleanparas) == 1 and cleanparas[0] == cleantitle:
                paras = [cleantitle]
            else:
                paras = [cleantitle] + cleanparas
        else:
            paras = [cleantitle]
        # There are quite a few media pages whose only text is the filename;
        # this, conveniently, is typically also the post's name.
        if len(paras) == 1 and paras[0].lower() in urlpath(link).lower():
            continue
        if paras:
            out.write('# Location: %s\n' % link)
            out.write('# Genre: Diplomatic\n')
            if trans:
                out.write('# Translation: %s\n' % trans)
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')

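# _pl_usembassy_gov_path() above filters the pl.usembassy.gov sitemap down to
# pages worth crawling. The helper lives outside this section; the sketch
# below, which keeps Polish-language content pages and drops obvious
# non-article paths, is only a guess at its intent.
def _sketch_pl_usembassy_gov_path(url):
    """Hypothetical: keep Polish-language content pages only."""
    path = urlpath(url)
    if not path.startswith('/pl/'):
        return False
    # Skip media attachments and paginated archives.
    return not re.search(r'\.(?:jpg|png|pdf)$|/page/\d+/$', path)
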
def crawl_kuensel(crawler, out):
    urls = find_wordpress_urls(crawler, 'http://www.dzkuensel.com/')
    urls = [u for u in urls if '%' in u]
    for url in sorted(urls):
        doc = crawler.fetch(url)
        assert doc.status == 200, (doc.status, url)
        html = doc.content.decode('utf-8')
        title = re.search(r'<h1[^>]*>(.+?)</h1>', html).group(1)
        pubdate = re.search(r'"datePublished":"(.+?)"', html)
        pubdate = cleantext(pubdate.group(1)) if pubdate else None
        text = html.split('<div class="entry">',
                          1)[1].split('<!-- .entry ')[0]
        text = text.replace('\n', ' ').replace('</p>', '\n')
        paras = [cleantext(p) for p in [title] + text.splitlines()]
        paras = list(filter(None, paras))
        if any(p.startswith('Search for') for p in paras):
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        out.write('\n'.join(paras) + '\n')

def _scrape_paiperatapu(crawler, out):
    books = crawler.fetch(
        'http://www.paiperatapu.maori.nz/paipera-tapu-online')
    assert books.status == 200, books.status
    bookshtml = books.content.decode('utf-8')
    bookshtmlinner = bookshtml.split(
        '<div class="bible-book-list">')[1].split(
            '<li class="first bible-search">')[0]
    for bookslink in re.findall(r'<a href="(/bible/[0-9]*/[^"]*)">',
                                bookshtmlinner):
        bookurl = 'http://www.paiperatapu.maori.nz' + bookslink
        book = crawler.fetch(bookurl)
        assert book.status == 200, book.status
        bookhtml = book.content.decode('utf-8')
        bookhtmlinner = bookhtml.split(
            '<ul class="bible-chapter-list">')[1].split(
                '<div class="bible-links">')[0]
        for chapterlink in re.findall(
                r'<a href="(/bible/[0-9]*/[^/]*/[^"]*)">', bookhtmlinner):
            url = 'http://www.paiperatapu.maori.nz' + chapterlink
            chapter = crawler.fetch(url)
            assert chapter.status == 200, chapter.status
            chapterhtml = chapter.content.decode('utf-8')
            if '<dl class="bible-chapter-content">' not in chapterhtml:
                continue
            out.write('# Location: %s\n' % url)
            title = re.search(r'<title>(.+?)</title>', chapterhtml)
            if title:
                title = striptags(title.group(1).split('| Te')[0]).strip()
            # Title is in English.
            if title:
                out.write('# Title: %s\n' % cleantext(title))
            out.write('# Genre: Religion\n')
            chapterhtmlinner = chapterhtml.split(
                '<dl class="bible-chapter-content">')[1].split(
                    '<div class="bible-chapter-seek">')[0]
            for verse in re.finditer(
                    r'<dt><a name="[^"]*"></a>([^<]*)</dt>'
                    r'<dd class="[^"]*">([^<]*)</dd>', chapterhtmlinner):
                out.write('%s %s\n' %
                          (verse.group(1), cleantext(verse.group(2))))

def crawl_naij(crawler, out):
    urls = crawler.fetch_sitemap(
        'https://hausa.naij.com/naij/sitemap/hausa/sitemap.xml').keys()
    urls = sorted([u for u in urls if u.find('hausa') > 0])
    for url in urls:
        doc = crawler.fetch(url).content.decode('utf-8')
        doc = re.sub(r'<script>.+?</script>', '', doc, flags=re.DOTALL)
        pubdate = re.search(
            r'<meta itemprop="datePublished" content="(.+?)"', doc).group(1)
        title = cleantext(
            re.search(r'<h1.*?>(.+?)</h1>', doc, re.DOTALL).group(1))
        article = '<article' + doc.split('<article')[1].split('<p>Source:')[0]
        paragraphs = [title]
        for text in article.split('</p>'):
            text = cleantext(text)
            if text:
                paragraphs.append(text)
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        out.write('# Publication-Date: %s\n' % pubdate)
        for p in paragraphs:
            out.write(p + '\n')

def _rte_cleanall(html):
    section_article_regex = re.compile(
        r'<section[^>]+itemprop="articleBody"[^>]*>')
    search = section_article_regex.search(html)
    out = []
    if search:
        body = extract(search.group(0), '</section>', html)
        for para in clean_paragraphs(body):
            if _rte_writable_paragraph(para):
                out.append(para)
        return '\n'.join(out)
    for paragraph in re.findall(r'<p>(.+?)</p>', html):
        cleaned = cleantext(paragraph)
        if _rte_writable_paragraph(cleaned):
            out.append(cleaned)
    return '\n'.join(out)

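# _rte_writable_paragraph(), used by both crawl_nuachtrte() and
# _rte_cleanall(), decides whether a cleaned paragraph is genuine article
# text or site boilerplate. Its real rules are not shown here; the sketch
# below merely illustrates the kind of filtering the call sites imply, and
# the boilerplate prefixes are assumptions.
def _sketch_rte_writable_paragraph(text):
    """Hypothetical: drop empty strings and obvious RTÉ boilerplate."""
    if not text:
        return False
    boilerplate = ('©', 'RTÉ.ie', 'Follow @')
    return not any(text.startswith(b) for b in boilerplate)
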
def crawl_coislife_ie(crawler, out):
    links = set()
    for num in range(1, 12):
        if num > 1:
            listurl = ('https://www.coislife.ie/product-category/ga/page/%s/'
                       % num)
        else:
            listurl = 'https://www.coislife.ie/product-category/ga/'
        idxres = crawler.fetch(listurl)
        if idxres.status != 200:
            continue
        idxhtml = idxres.content.decode('utf-8')
        index = extract('<div class="products-archive--products">',
                        '<nav class="woocommerce-pagination">', idxhtml)
        for link in re.findall(
                r'<a href="(https://www.coislife.ie/product/[^"]+?)">',
                index):
            links.add(link)
    for url in sorted(links):
        fetchresult = crawler.fetch(url)
        if fetchresult.status != 200:
            continue
        html = fetchresult.content.decode('utf-8')
        title = re.search(r'<title>(.+?)</title>', html)
        title = title.group(1).split('–')[0].strip() if title else ''
        desc = re.search(
            r'<meta property="og:description" content="([^"]+?)"', html)
        desc = cleantext(desc.group(1)) if desc else None
        body = extract(
            '<div class="tab-content">',
            '<div class="entry-content in fade tab-pane" '
            'id="tab-additional_information">', html) or ''
        paras = clean_paragraphs(title + '<br/>' + body)
        pubdate = fetchresult.headers.get('Last-Modified')
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: Commerce\n')
            if desc:
                out.write('# Description: %s\n' % desc)
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            for para in paras:
                if para.find('Léigh sliocht as an leabhar') >= 0:
                    continue
                out.write(para + '\n')

def crawl_jagbani_punjabkesari_in(crawler, out):
    urls = set()
    main = crawler.fetch('http://jagbani.punjabkesari.in/')
    assert main.status == 200, main.status
    menu = extract('<nav id="menu" class="menu">', '</nav>',
                   main.content.decode('utf-8'))
    urls_re = re.compile(
        r'href="(https?://jagbani\.punjabkesari\.in/[^"]+?)"')
    category_urls = urls_re.findall(menu)
    for category_url in sorted(set([x.strip() for x in category_urls])):
        for page in range(1, 1000):
            doc = crawler.fetch(category_url + '/page/%d' % page)
            content = doc.content.decode('utf-8') if doc.status == 200 else ''
            if content.find('class="story"') < 0:
                break
            for u in urls_re.findall(
                    extract('<span class="story">', '<div class="kjpage"',
                            content)):
                urls.add(urlencode(u.strip()))
    for url in sorted(urls):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        try:
            content = doc.content.decode('utf-8')
        except UnicodeDecodeError:
            continue
        title = extract('<title>', '</title>', content)
        text = extract('<article>', '</article>', content)
        if not text:
            continue
        text = re.sub(r'<br[^a-zA-Z][^>]*>', '<br>', text)
        text = text.replace('\n', ' ').replace('<br>', '\n')
        paras = [title] + text.splitlines()
        paras = list(filter(None, [cleantext(p) for p in paras]))
        pubdate = re.search(
            r'<meta property="article:published_time" content="([^"]+?)"',
            content)
        pubdate = pubdate.group(1) if pubdate else None
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')

def _crawl_osagelanguagetools(crawler, out):
    for database in ('578', 'Approved+Words'):
        url = ('http://osagelanguagetools.appspot.com/words/getPhrases/'
               '?filterStatus=&databases=%s&sortCriteria=index' % database)
        html = crawler.fetch_content(url)
        out.write('# Location: %s\n' % url)
        out.write('# Genre: Dictionary\n')
        out.write('# Publication-Date: 2017\n')
        for row in re.findall(r'<tr.+?</tr>', html, flags=re.DOTALL):
            row = row.replace('\n', ' ')
            text = re.search('<td class="unicodeOsageText.+?>(.+?)</td>', row)
            if not text:
                continue
            text = text.group(1)
            text = text.replace('(Myrtle)', ' ').replace('(Mogri)', ' ')
            text = cleantext(text)
            if text.startswith('Teach'):
                continue
            out.write('%s\n' % text)

def crawl_wikisource_trieste_vernacola(crawler):
    out = crawler.get_output(language='vec-u-sd-itts')
    urls = set()
    index = crawler.fetch(
        'https://vec.wikisource.org/wiki/Indice:Trieste_vernacola.djvu')
    assert index.status == 200, index.status
    remarks = extract('<div id="remarks">', 'Colombe</a>',
                      index.content.decode('utf-8'))
    for path in sorted(set(re.findall(r'href="(/wiki/[^"]+)"', remarks))):
        if not path.startswith('/wiki/Trieste_vernacola/'):
            urls.add('https://vec.wikisource.org' + path)
    for url in sorted(urls.difference(BLACKLISTED_URLS)):
        doc = crawler.fetch(url)
        assert doc.status == 200, (doc.status, url)
        content = doc.content.decode('utf-8')
        text = extract('<div id="scatola" class="testo">', '<noscript>',
                       content)
        text = text.split('<dt>Note</dt>')[0].split('<dl>')[0]
        text = text.replace('\n', ' ')
        text = re.sub(r'<sup.+?</sup>', '', text)
        text = text.replace('\xa0', ' ')  # NBSP used for spacing
        text = text.replace("'", "’")
        text = re.sub(r'<!--.+?-->', '', text, flags=re.DOTALL)
        text = re.sub(r' alt="[^"]+"', ' ', text, flags=re.DOTALL)
        text = re.sub(r'<span class="numeroriga".+?</span>', '', text)
        text = re.sub(r'</(?:div|DIV|p|P|[hH][1-6]|table|TABLE)>', '\n', text)
        text = re.sub(r'<(?:br|BR)\s*/?>', '\n', text)
        lines = [l for l in text.splitlines()
                 if l.find('noprint') < 0 and l.find('font-size:smaller') < 0]
        text = '\n'.join([cleantext(l) for l in lines])
        text = re.sub('\n{2,}', '<p>', text).replace('\n', ' | ')
        text = text.replace('<p>', '\n')
        paras = list(filter(None,
                            [' '.join(p.split()) for p in text.splitlines()]))
        if not paras:
            continue
        # The book, published in 1920, is a collection of earlier lyrics.
        pubyear = re.search(r'<span id="ws-year">(\d{4})</span>', content)
        pubyear = int(pubyear.group(1)) if pubyear else 1920
        out.write('# Location: %s\n' % url)
        out.write('# Genre: Lyrics\n')
        out.write('# Publication-Date: %d\n' % pubyear)
        out.write('\n'.join(paras) + '\n')

def crawl_tamurt(crawler, out):
    for url in sorted(find_tamurt_urls(crawler)):
        doc = crawler.fetch(url)
        assert doc.status == 200, (doc.status, url)
        html = doc.content.decode('utf-8')
        title = re.search(r'<title>(.+?)</title>', html).group(1)
        if title.endswith(' - Tamurt'):
            title = title[:-len(' - Tamurt')]
        pubdate = re.search(
            r'<meta property="article:published_time" content="([^"]+)"',
            html)
        pubdate = pubdate.group(1).strip() if pubdate else None
        content = '<div ' + html.split('<div class="entry-content"', 1)[1]
        content = content.split('<!-- .entry-content -->')[0]
        content = re.sub(r'<!--.+?-->', '', content)
        paras = [title] + content.replace('</p>', '\n').splitlines()
        paras = filter(None, [cleantext(p) for p in paras])
        out.write('# Location: %s\n' % url)
        out.write('# Genre: Blog\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        for p in paras:
            out.write(p + '\n')

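# crawl_tamurt() iterates over find_tamurt_urls(), which is defined
# elsewhere. The site is a WordPress blog, so discovery via a sitemap is one
# plausible shape for the helper; the domain and sitemap path below are
# assumptions, not facts taken from the source.
def _sketch_find_tamurt_urls(crawler):
    """Hypothetical: collect article URLs for the Tamurt blog."""
    sitemap = crawler.fetch_sitemap('https://www.tamurt.info/sitemap.xml')
    return {url for url in sitemap.keys() if '/category/' not in url}
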
def crawl_nupepa_org(crawler, out):
    urls = set()
    for i in range(1, 104):
        url = ('http://nupepa.org/gsdl2.5/cgi-bin/nupepa?e=p-0nupepa--'
               '00-0-0--010---4-----text---0-1l--1en-Zz-1---20-about---'
               '0003-1-0000utfZz-8-00&a=d&cl=CL2.' + str(i))
        doc = crawler.fetch(url)
        assert doc.status == 200, url
        content = doc.content.decode('utf-8')
        for u in re.findall(r'href="(/gsdl2.5/cgi-bin/nupepa[^"]+)"',
                            content):
            if u.endswith('gg=text'):
                urls.add('http://nupepa.org' + replace_html_entities(u))
    for url in sorted(urls):
        doc = crawler.fetch(url)
        assert doc.status == 200, url
        content = doc.content.decode('utf-8')
        if content.find('Document contains no data') >= 0:
            continue
        pubdate = re.search(r'tif_([0-9]{4})([01][0-9])([0123][0-9])\.tif"',
                            content)
        pubdate = ('%s-%s-%s' % (pubdate.group(1), pubdate.group(2),
                                 pubdate.group(3)) if pubdate else None)
        paras = []
        while True:
            text = extract(
                "<p class=MsoNormal style='text-autospace:none'>"
                "<span style='font-size:10.0pt'>", '</table>', content)
            if not text:
                break
            text = text.replace('\n', ' ').replace('<br>', '\n')
            text = replace_html_entities(text.replace('\xa0', ' '))
            paras.extend([cleantext(p) for p in text.splitlines()])
            nexturl = re.search(r'<a href="([^"]+)">next page', content)
            if nexturl is None:
                break
            nexturl = ('http://nupepa.org' +
                       replace_html_entities(nexturl.group(1)))
            doc = crawler.fetch(nexturl)
            assert doc.status == 200, (doc.status, nexturl)
            content = doc.content.decode('utf-8')
        text = '\n'.join(filter(None, paras))
        # Strip passages that are entirely in English.
        text = re.sub(
            r'DEATH OF MR\. DOUGLAS.+?has not been heard of since\.', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'IV\. "Their Majesties do further agree.+?by the parties\.', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'1 Oh, come, come away, from labor now reposing,.+?'
            r'Honolulu, Nov\. 25, 1861\. J\. L\. N\.\*', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'Died at sea, August 14.+?after a passage of about a month\.',
            '', text, flags=re.DOTALL)
        text = re.sub(
            r'On the 26th ult\. the Rev\. J.+?best wishes to you all\."', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'The subscriber avails himself.+?agreeable circumstances\.', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'NOTICE\. The publishing of.+for want of paper\.', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'ARRIVALS AT OAHU, SANDWICH ISLANDS,.+Sold here to the '
            r'Government\.', '', text, flags=re.DOTALL)
        text = re.sub(
            r'NOTICE\. NOTICE is hereby given,.+by the subscriber\.', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'Articles made and agreed.+?upon the Sandwich Islands\.', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'MRS\. MARIA M\. DIBBLE\. Died at Lahainaluna.+?'
            r'SHELDON DIBBLE\.', '', text, flags=re.DOTALL)
        text = re.sub(
            r'DEATH OF MRS\. BETSEY C\. LYONS.+?the son of man cometh\.\"',
            '', text, flags=re.DOTALL)
        text = re.sub(
            r'CARD\. The Missionary Company.+?April 20th 1837\.', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'DISTRESS OF THE WHALE SHIP GEORGE.+?who is now master of '
            r'her\.', '', text, flags=re.DOTALL)
        text = re.sub(
            r'KNOW ALL MEN, That according.+?especially those above re-', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'cited, of the said Commissioners.+?and acknowledge the '
            r'Protest', '', text, flags=re.DOTALL)
        text = re.sub(
            r'and withdrawal of our Deputy as our own.+?in the dominions of '
            r'the Queen of', '', text, flags=re.DOTALL)
        text = re.sub(
            r'Taheite that I have received instructions.+?'
            r'Commodore\. \[Official Copy\]', '', text, flags=re.DOTALL)
        text = re.sub(
            r'TO HIS MAJ\. KAMEHAMEHA.+?Naval Force in the E\. Indies\.', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'To the House of Representatives of the United States.+?'
            r'the arts of civilized life\.', '', text, flags=re.DOTALL)
        text = re.sub(
            r'It cannot but be in conformity.+?right to complain\.', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'The Committee on Foreign Affairs, to whom was.+?'
            r'peace and love\.', '', text, flags=re.DOTALL)
        text = re.sub(
            r'WASHINGTON, June 25th, 1843.+?treat upon all occassions, the',
            '', text, flags=re.DOTALL)
        text = re.sub(
            r'native rulers of the Sandwich.+?P\. Upshur, &c\. &c\.', '',
            text, flags=re.DOTALL)
        if text.startswith('TERMS. One copy'):
            # Article entirely in English.
            continue
        paras = list(filter(None, [cleantext(p) for p in text.splitlines()]))
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')

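# crawl_nupepa_org() calls replace_html_entities() on both URLs and page
# text; the helper itself is defined outside this section. On Python 3 the
# standard library covers this, so the sketch is just a thin wrapper;
# treating it as equivalent to html.unescape() is an assumption.
import html as _htmllib

def _sketch_replace_html_entities(text):
    """Hypothetical: decode &amp;, &#257;, &nbsp; and friends to characters."""
    return _htmllib.unescape(text)
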
def _scrape_maoritelevision(crawler, out):
    articlelist = set()
    articlelist.add('http://www.maoritelevision.com/mi/purongo/purongo-hou')
    articlelist.add('http://www.maoritelevision.com/mi/purongo/hakinakina')
    for i in range(1, 101):
        articlelist.add(
            'http://www.maoritelevision.com/mi/purongo/purongo-hou?page=%d'
            % i)
        articlelist.add(
            'http://www.maoritelevision.com/mi/purongo/hakinakina?page=%d'
            % i)
    links = set()
    pubdate_regex = re.compile(r'<time datetime="([0-9T:+\-]{25})"')
    for url in articlelist:
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        content = doc.content.decode('utf-8')
        for articlepiece in content.split('<article')[1:]:
            for artlink in re.findall('<a href="(/mi/purongo/[^"]*)"',
                                      articlepiece):
                if not artlink.startswith('/mi/purongo/purongo-hou'):
                    links.add('http://www.maoritelevision.com%s' % artlink)
    for url in links:
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        if 'a-motu/rereatea-midday-news' in url:
            continue
        html = doc.content.decode('utf-8')
        if 'lang="mi"' not in html:
            continue
        if 'itemprop="articleBody"' not in html:
            continue
        genre = 'Sport' if '/hakinakina/' in url else 'News'
        pubdate_match = pubdate_regex.search(html)
        pubdate = pubdate_match.group(1) if pubdate_match else None
        if pubdate is None:
            pubdate = doc.headers.get('Last-Modified')
        # These news stories are a parallel (or at least comparable) corpus,
        # so keep the link to the English article.
        english = re.search(
            r'<a href="(/news/[^"]*)" class="language-link" lang="en">',
            html)
        if english:
            english = 'http://www.maoritelevision.com%s' % english.group(1)
        tags = set()
        if '<ul class="tags">' in html:
            tagshtml = html.split('<ul class="tags">')[1].split('</ul>')[0]
            for tag in re.findall(r'<a href="(?:[^"]*)">([^<]*)</a>',
                                  tagshtml):
                tags.add(cleantext(tag))
        paras = []
        title = re.search(r'<title>(.+?)</title>', html)
        if title:
            paras.append(
                cleantext(striptags(title.group(1).split('| Māori')[0])))
        articlehtml = html.split('class="field-body"')[1].split('</div>')[0]
        paras.extend(
            [cleantext(p) for p in re.findall(r'<p>(.+?)</p>', articlehtml)])
        # Filter out English paragraphs.
        paras = [p for p in paras if p and p.find(' the ') < 0]
        if not paras:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: %s\n' % genre)
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        if english:
            out.write('# Translation.en: %s\n' % english)
        if tags:
            out.write('# Tags: %s\n' % ', '.join(tags))
        out.write('\n'.join(paras) + '\n')
