Example #1
def crawl_loksatta_com(crawler, out):
    sitemap = crawler.fetch_sitemap('http://www.loksatta.com/sitemap.xml')
    for url in sorted(sitemap):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        try:
            html = doc.content.decode('utf-8')
        except UnicodeDecodeError:
            continue
        pubdate = re.search(
            r'<meta itemprop="datePublished" content="(.+?)"', html)
        pubdate = cleantext(pubdate.group(1)) if pubdate else None
        headline = extract('<h1 itemprop="headline" id="headline">', '</h1>',
                           html)
        synopsis = extract('<h2 itemprop="description" class="synopsis">',
                           '</h2>', html)
        text = extract('itemprop="articleBody">', '<div', html)
        if not text:
            continue
        text = text.replace('\n', ' ')
        text = re.sub(r'</?(?:br|BR|p|P)\s*?/?>', '\n', text)
        paras = [headline, synopsis] + text.splitlines()
        paras = list(filter(None, [cleantext(p) for p in paras]))
        if paras:
            out.write('# Location: %s\n# Genre: News\n' % url)
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')
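Every example in this file leans on a small set of shared helpers (`extract`, `cleantext`, `clean_paragraphs`, `urlpath`) defined elsewhere in the crawler codebase. As a rough, hedged sketch of the behaviour the calls above assume — literal start/end markers, `None` returned when either marker is missing — `extract` can be read roughly as:

def extract(start, end, html):
    # Hypothetical sketch of the assumed helper: return the text between the
    # first occurrence of `start` and the next occurrence of `end` in `html`,
    # or None when either marker cannot be found.
    begin = html.find(start)
    if begin < 0:
        return None
    begin += len(start)
    stop = html.find(end, begin)
    if stop < 0:
        return None
    return html[begin:stop]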
Example #2
def crawl_azattyk_org(crawler, out):
    sitemap = crawler.fetch_sitemap('https://www.azattyk.org/sitemap.xml')
    for url in sorted(sitemap.keys()):
        if not urlpath(url).startswith('/a/'):
            continue
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        pubdate = re.search(r'"dateModified":"([^"]+)"', html)
        if pubdate is not None:
            pubdate = cleantext(pubdate.group(1)).replace(' ', 'T')
        title = extract('<title>', '</title>', html)
        text = extract('content-offset">', '</div>', html)
        if not title or not text:
            continue
        paras = [title] + re.sub(r'<br\s*?/?>', '\n', text).splitlines()
        paras = filter(None, [cleantext(p) for p in paras])
        paras = [p for p in paras if not p.startswith('http')]
        if not paras:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        for p in paras:
            out.write(p + '\n')
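The `urlpath(url)` check above is another assumed helper. A minimal sketch using the standard library, under the assumption that it simply returns the path component of a URL, would be:

from urllib.parse import urlparse


def urlpath(url):
    # Hypothetical sketch: the path component of a URL, so that
    # urlpath('https://www.azattyk.org/a/123.html') yields '/a/123.html'.
    return urlparse(url).path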
Example #3
def _crawl_observador_pt(crawler, out):
    urls = set()
    for author_page in sorted(
            re.findall(
                r'href="(https?://observador.pt/perfil/[a-zA-Z_\-0-9]+/)"',
                crawler.fetch_content('http://observador.pt/autores/'))):
        html = crawler.fetch_content(author_page)
        urls.update(
            re.findall(
                r'href="(https?://observador.pt/20\d{2}/\d{2}/\d{2}/[^"]+)"',
                html))
    for url in sorted(urls):
        try:
            html = crawler.fetch_content(url)
        except UnicodeDecodeError:
            continue
        title = re.search(r'<meta property="og:title" content="([^"]+)"', html)
        title = title.group(1) if title else ''
        pubdate = re.search(r'"dateModified":"([^"]+)"', html)
        pubdate = pubdate.group(1) if pubdate else None
        lead = extract('<div class="lead">', '</div>', html) or ''
        content = extract('<div class="content">', '<h1>', html) or ''
        text = '\n'.join(clean_paragraphs('<p>'.join([title, lead, content])))
        text = text.split('\nContinuar a ler')[0]
        text = text.split('\nLer mais')[0]
        text = text.split('\nPartilhe')[0]
        text = text.split('\nComente')[0]
        if text:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write(text)
            out.write('\n')
Example #4
def _crawl_kauno_diena_lt(crawler, out):
    urls = {}
    for i in range(1, 6):
        url = 'http://kauno.diena.lt/sitemap/kd/sitemap%d.xml' % i
        urls.update(crawler.fetch_sitemap(url))
    for url in sorted(urls):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        try:
            html = doc.content.decode('utf-8')
        except UnicodeDecodeError:
            continue
        title = extract('<h1 class="title" id="page-title">', '</h1>', html)
        title = cleantext(title if title else '')
        body = extract("<span itemprop='articleBody'>", '</div>', html) or ''
        paras = []
        for p in clean_paragraphs('%s<br/>%s' % (title, body)):
            if 'MicrosoftInternetExplorer4' in p:
                break
            paras.append(p)
        pubdate = re.search(
            r'<span\s+property="dc:date\s+dc:created"\s+content="(20[^"]+)"',
            html)
        pubdate = pubdate.group(1) if pubdate else None
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')
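`clean_paragraphs(...)`, used here and in many of the following examples, is assumed to split markup into paragraphs and normalise each one via `cleantext`. A minimal sketch, assuming `cleantext` unescapes entities, drops tags and collapses whitespace, might look like this:

import html as htmllib
import re


def cleantext(text):
    # Hypothetical sketch: unescape HTML entities, strip remaining tags,
    # and collapse runs of whitespace into single spaces.
    if not text:
        return ''
    text = htmllib.unescape(text)
    text = re.sub(r'<[^>]*>', ' ', text)
    return ' '.join(text.split())


def clean_paragraphs(markup):
    # Hypothetical sketch: split on paragraph-like tags, clean each chunk,
    # and drop the chunks that end up empty.
    chunks = re.split(r'</?(?:p|div|br|h[1-6])[^>]*>', markup,
                      flags=re.IGNORECASE)
    return [p for p in (cleantext(c) for c in chunks) if p]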
Example #5
def crawl_ainm_ie(crawler, out):
    links = set()
    for let in map(chr, range(65, 91)):
        idxres = crawler.fetch('https://www.ainm.ie/Abc.aspx?Letter=%s' % let)
        if idxres.status != 200:
            continue
        idxhtml = idxres.content.decode('utf-8')
        index = extract('<div id="pageContent" role="main">',
                        '<!-- .contentWrapper-->', idxhtml)
        for link in re.findall(r'<a href="(Bio.aspx\?ID=[^"]+?)">', index):
            links.add('https://www.ainm.ie/%s' % link)
    for url in sorted(links):
        fetchresult = crawler.fetch(url)
        if fetchresult.status != 200:
            continue
        html = fetchresult.content.decode('utf-8')
        title = re.search(r'<title>(.+?)</title>', html)
        title = title.group(1).split('|')[0] if title else ''
        body = extract('<div class="article">', '<!-- .contentWrapper-->',
                       html) or ''
        body = body.split('<div id="machines"')[0]
        paras = clean_paragraphs(title + '<br/>' + body)
        pubdate = fetchresult.headers.get('Last-Modified')
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: Biography\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')
Example #6
def _crawl_iltirreno_gelocal_it(crawler, out):
    urls = set()
    for category in ('italia-mondo', 'focus/toscana-economia',
                     'empoli/cronaca', 'grosseto/cronaca',
                     'livorno/cronaca', 'livorno/dagli-enti',
                     'lucca/cronaca', 'pisa/cronaca', 'prato/cronaca',
                     'versilia/cronaca'):
        urls.update(_find_tirreno_urls(crawler, category))
    for url in sorted(urls):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        content = doc.content.decode('utf-8')
        header = extract('<h1 itemprop="headline name">',
                         '<span itemprop="author"', content) or ''
        body = extract('<span itemprop="articleBody" >', '©', content) or ''
        paras = clean_paragraphs('%s<p/>%s' % (header, body))
        text = '\n'.join(paras)
        for sep in ('Tags\n', 'Redazione | Scriveteci', 'TrovaRistorante',
                    '<a href="', 'I COMMENTI DEI LETTORI', '©RIPRODUZIONE'):
            text = text.split(sep)[0]
        paras = text.splitlines()
        pubdate = re.search(
            r'<time itemprop="datePublished" content="([^"]+)"', content)
        pubdate = pubdate.group(1) if pubdate else None
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')
Example #7
def crawl_blogspot(crawler, out, host):
    sitemap = crawler.fetch_sitemap('https://%s/sitemap.xml' % host)
    pubdate_regex = re.compile(
        r"<abbr class='published' title='([^']*)'>[^<]*</abbr>")
    for url in sorted(sitemap.keys()):
        fetchresult = crawler.fetch(url)
        if fetchresult.status != 200:
            continue
        html = fetchresult.content.decode('utf-8')
        pubdate_match = pubdate_regex.search(html)
        pubdate = pubdate_match.group(1) if pubdate_match else None
        if pubdate is None: pubdate = fetchresult.headers.get('Last-Modified')
        if pubdate is None: pubdate = sitemap[url]
        title = re.search(r"<meta content='([^']+)' property='og:title'/>",
                          html)
        title = title.group(1) if title else ''
        post = extract("<div class='post-body entry-content'>",
                       "<div class='post-footer'>", html)
        if post is None:
            post = extract("<div class='post-header'>",
                           "<div class='post-footer'>", html)
        if post is None:
            post = extract('<div class="post-body">',
                           '<p class="post-footer">', html)
        if post is None:
            continue
        paras = clean_paragraphs(title + '<br/>' + post)
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: Blog\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')
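Unlike the other entry points, crawl_blogspot is parameterised by host, so a concrete crawler would simply forward the blog's hostname. A hypothetical wrapper (the hostname below is made up for illustration) would look like:

def crawl_example_blog(crawler, out):
    # Hypothetical wrapper: forward a concrete blogspot hostname to the
    # generic crawl_blogspot routine defined above.
    crawl_blogspot(crawler, out, host='example-blog.blogspot.com')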
Example #8
def crawl_tuairisc_ie(crawler, out):
    sitemap = crawler.fetch_sitemap('https://tuairisc.ie/sitemap.xml')
    pubdate_regex = re.compile(
        r'<time datetime="(20\d\d-\d\d-\d\d)\s+(\d\d:\d\d)" '
        r'itemprop="datePublished">')
    for url in sorted(sitemap.keys()):
        fetchresult = crawler.fetch(url)
        if fetchresult.status != 200:
            continue
        html = fetchresult.content.decode('utf-8')
        title = extract('<h1 class="title article--full__title">', '</h1>',
                        html) or ''
        pubdate_match = pubdate_regex.search(html)
        if pubdate_match:
            pubdate = '%sT%s:00Z' % (pubdate_match.group(1),
                                     pubdate_match.group(2))
        else:
            pubdate = sitemap[url]
        body = extract(
            '<div class="article--full__content" itemprop="articleBody">',
            '</article>', html)
        if not body:
            continue
        paras = clean_paragraphs(title + '<p/>' + body)
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')
Example #9
def crawl_meoneile_ie(crawler, out):
    sitemap = crawler.fetch_sitemap('https://meoneile.ie/sitemap.xml')
    for url in sorted(sitemap.keys()):
        if url == 'https://meoneile.ie/':
            continue
        fetchresult = crawler.fetch(url)
        if fetchresult.status != 200:
            continue
        html = fetchresult.content.decode('utf-8')
        title = (extract('<title>', '</title>', html) or '').strip()
        title = title.split('&lt;')[0].strip()
        video = re.search(
            r"<iframe.*src='(//player.vimeo.com/video/[0-9]+)[^>]*></iframe>",
            html)
        body = extract("<div class='article-content'>", '</article>',
                       html) or ''
        byline = extract("<div class='byline'>", '</span>', html) or ''
        byline = _byline_to_pubdate(byline)
        if body.find('<strong>%s</strong>' % title) >= 0:
            title = ''
        paras = clean_paragraphs(title + '<br/>' + body)
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if video:
                out.write('# Video: https:%s\n' % video.group(1))
            if byline:
                out.write('# Publication-Date: %s\n' % byline)
            for para in paras:
                if para == 'Roinn':
                    continue
                else:
                    out.write(para + '\n')
Example #10
def crawl_azattyk_org(crawler, out):
    sitemap = crawler.fetch_sitemap('https://www.azattyk.org/sitemap.xml')
    for url in sorted(sitemap.keys()):
        if not urlpath(url).startswith('/a/'):
            continue
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        pubdate = re.search(r'"dateModified":"([^"]+)"', html)
        if pubdate is not None:
            pubdate = cleantext(pubdate.group(1)).replace(' ', 'T')
        title = extract('<title>', '</title>', html)
        text = extract('content-offset">', '<footer', html)
        if not title or not text:
            continue
        text = text.split('<span class="share')[0]
        text = text.split('<div class="region"')[0]
        text = text.replace('\n', ' ')
        paras = [title] + re.sub(r'<(?:br|p|div)\s*?/?>', '\n', text).splitlines()
        paras = filter(None, [cleantext(p.strip()) for p in paras])
        paras = [p for p in paras if not p.startswith('http')]
        if not paras:
            continue
        # Filter out English text.
        if ord(paras[0][0]) <= 0xFF or ord(paras[-1][-1]) <= 0xFF:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        for p in paras:
            out.write(p + '\n')
Example #11
def _crawl_mwnation_com(crawler, out):
    urls = set()
    index = crawler.fetch_content('http://mwnation.com/section/chichewa/')
    pages = re.findall(r'/section/chichewa/page/(\d+)/', index)
    num_pages = max([int(p) for p in pages])
    for page in range(1, num_pages + 1):
        url = 'http://mwnation.com/section/chichewa/'
        if page > 1:
            url += 'page/%d/' % page
        doc = crawler.fetch_content(url)
        urls.update(re.findall(r'<a href="([^"]+?)">Continue Reading', doc))
    for url in sorted(urls):
        doc = crawler.fetch_content(url)
        pubdate = re.search(
            r'<meta property="article:published_time" content="([^"]+)"', doc)
        pubdate = pubdate.group(1) if pubdate is not None else None
        title = extract('<h1 class="entry-title" itemprop="headline">',
                        '</h1>', doc) or ''
        body = extract('<div class="entry-content" itemprop="articleBody">',
                       '<footer ', doc) or ''
        paras = clean_paragraphs(title + '<br/>' + body)
        text = '\n'.join(paras) + '\n'
        if text.find(' the ') >= 0:  # likely English
            continue
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write(text)
Example #12
def crawl_wantokniuspepa_com(crawler, out):
    sections = {
        'abc-pasifik-nius', 'bisnis-nius', 'helt-nius', 'komentri',
        'laip-stail', 'meri-nius', 'nius', 'wantok'
    }
    seeds = set()
    for section in sorted(sections):
        section_url = 'http://wantokniuspepa.com/index.php/%s' % section
        seeds.add(section_url)
        section_index = crawler.fetch(section_url)
        assert section_index.status == 200, (section_index.status, section_url)
        last_page = re.search('"End" href=".+?start=(\d+)" class="pagenav"',
                              section_index.content.decode('utf-8'))
        if last_page is not None:
            for page in range(1, int(last_page.group(1)) + 1):
                seeds.add('http://wantokniuspepa.com/index.php/%s?start=%d' %
                          (section, page))
    urls = set()
    for seed in sorted(seeds):
        doc = crawler.fetch(seed)
        assert doc.status == 200, (doc.status, seed)
        content = doc.content.decode('utf-8')
        for u in re.findall(r'(/index\.php/[^"]+?)"', content):
            p = u.split('/')
            if len(p) > 3 and p[1] == 'index.php' and p[2] in sections:
                if re.search(r'/\d{4,}', u) is not None:
                    urls.add('http://wantokniuspepa.com' + u.split('?')[0])
    for url in sorted(urls):
        doc = crawler.fetch(url)
        assert doc.status == 200, (doc.status, url)
        content = doc.content.decode('utf-8')
        title = extract('<title>', '</title>', content)
        pubdate = re.search(
            r'<time datetime="([^T]+?)T([^"]+?)" '
            'itemprop="datePublished">', content)
        pubdate = cleantext(pubdate.group(1)) if pubdate else None
        body = extract('<div itemprop="articleBody">', '<ul class="pager',
                       content)
        if not body:
            continue
        body = body.split('<div class="clearfix"')[0]
        text = body.replace('\n', ' ')
        text = text.replace(' ,', ',').replace('“ ', '“')
        text = re.sub(r'</(?:div|DIV|p|P|[hH][1-6]|table|TABLE)>', '\n', text)
        text = re.sub(r'<(?:br|BR)\s*/?>', '\n', text)
        paras = [cleantext(p) for p in [title or ''] + text.splitlines()]
        paras = [p for p in paras if p]
        if not paras:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        out.write('\n'.join(paras) + '\n')
Example #13
def crawl_pl_usembassy_gov(crawler, out):
    sitemap = crawler.fetch_sitemap('https://pl.usembassy.gov/sitemap_index.xml')
    trans_regex = re.compile(
        r'<h3>Tłumaczenie</h3><div class="translations_sidebar"><ul><li><a href ?="([^"]*)"'
    )
    pubdate_regex = re.compile(
        r'<meta property="article:published_time" content="([^"]*)"'
    )
    links = set()
    for key in sorted(sitemap.keys()):
        if _pl_usembassy_gov_path(key):
            links.add(key)
    for link in sorted(links):
        result = crawler.fetch(link)
        if result.status != 200:
            continue
        html = result.content.decode('utf-8')
        title = extract('<title>', '</title>', html)
        title = title if title else ''
        title = title.split(' | ')[0] if ' | ' in title else title
        pubdate_match = pubdate_regex.search(html)
        pubdate = pubdate_match.group(1) if pubdate_match else None
        trans_match = trans_regex.search(html)
        trans = trans_match.group(1) if trans_match else None
        if pubdate is None: pubdate = result.headers.get('Last-Modified')
        if pubdate is None: pubdate = sitemap[link]
        exstart = '<div class="entry-content">'
        exstart2 = '<div class="mo-page-content">'
        exend = '<!-- AddThis Advanced Settings above via filter on the_content -->'
        exstart = exstart2 if exstart2 in html else exstart
        content = extract(exstart, exend, html)
        cleanparas = clean_paragraphs(content) if content else None
        # Don't repeat the title if it's the only text content
        cleantitle = cleantext(title)
        if cleanparas:
            if len(cleanparas) == 1 and cleanparas[0] == cleantitle:
                paras = [cleantitle]
            else:
                paras = [cleantitle] + cleanparas
        else:
            paras = [cleantitle]
        # There are quite a few media pages whose only text is the filename;
        # conveniently, this is typically also the post's name.
        if len(paras) == 1 and paras[0].lower() in urlpath(link).lower():
            continue
        if paras:
            out.write('# Location: %s\n' % link)
            out.write('# Genre: Diplomatic\n')
            if trans:
                out.write('# Translation: %s\n' % trans)
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')
Example #14
def crawl_larenadomila_it(crawler):
    out = crawler.get_output(language='vec-u-sd-itvr')
    urls = find_urls_in_larenadomila_it(
        crawler, 'https://www.larenadomila.it/sito/index.php')
    for url in sorted(urls.difference(BLACKLISTED_URLS)):
        if url.find('&view=article&') < 0:
            continue
        doc = crawler.fetch(url)
        assert doc.status == 200, (doc.status, url)
        content = doc.content.decode('utf-8')
        title = cleantext(extract('<title>', '</title>', content))
        sections = [title] + [c.strip() for c in content.splitlines()]
        sections = [c for c in sections
                    if c.startswith('<div class="item_fulltext">')
                    or c.startswith('<p><span class="grassetto">')]
        sections = [c.replace(' <br />- ', ' ') for c in sections]
        text = '<br/>'.join(sections)
        text = text.replace('&nbsp;', ' ')  # used for spacing/formatting
        text = re.sub(r'</(?:div|DIV|p|P|[hH][1-6]|table)>', '\n', text)
        text = re.sub(r'<br\s*/?>', '\n', text)
        text = re.sub(r'\.{3,}', '… ', text)
        text = re.sub(r'\n(-)[^\s]', '- ', text)
        paras = list(filter(None, [cleantext(p) for p in text.split('\n')]))
        if not paras:
            continue
        out.write('# Location: %s\n' % url)
        out.write('\n'.join(paras) + '\n')
Example #15
def _crawl_vatantat_ru(crawler, out):
    index = crawler.fetch_content('http://www.vatantat.ru/')
    last = max([int(p) for p in re.findall(r'index\.php\?pg=(\d+?)"', index)])
    for page in range(2, last + 1):
        url = 'http://www.vatantat.ru/index.php?pg=%d' % page
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        content = doc.content.decode('utf-8')
        html = extract('<p><span style="font-size: large;"><strong>',
                       '<span style="font-size: 80%; font-weight: bold;">',
                       content)
        if not html:
            continue
        html = html.split('(“Ватаным Татарстан”,')[0]
        html = html.split('<script>')[0]
        paras = clean_paragraphs(html)
        if not paras:
            continue
        pubdate = re.search(
            r'Татарстан”,&nbsp;&nbsp;&nbsp;/№&nbsp;(none|\d+),&nbsp;'
            r'(\d\d)\.(\d\d)\.(20\d\d)/', content)
        if pubdate is not None:
            pubdate = ('%s-%s-%s' %
                       (pubdate.group(4), pubdate.group(3), pubdate.group(2)))
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        out.write('\n'.join(paras) + '\n')
Example #16
def _crawl_kyym_ru(crawler, out):
    index = crawler.fetch_content('http://www.kyym.ru/')
    last = max([
        int(s) for s in re.findall(r'href="/index\.php\?start=(\d+?)"', index)
    ])
    urls = set()
    for page in range(1, last + 1):
        doc = crawler.fetch_content('http://www.kyym.ru/index.php?start=%d' %
                                    page)
        for path in re.findall(r'<a href="(/index\.php\?view=article&[^"]+?)"',
                               doc):
            urls.add('http://www.kyym.ru' + path.replace('&amp;', '&'))
    for url in sorted(urls):
        doc = crawler.fetch_content(url)
        html = extract('<div class="news_item_article">',
                       '<!--end news item -->', doc)
        if not html:
            continue
        paras = clean_paragraphs(html)
        if not paras:
            continue
        pubdate = re.search(
            r'<span class="createdate"><!-- date and by -->'
            r'\s*(\d{1,2})\.(\d{2})\.(20\d{2})',
            doc,
            flags=re.DOTALL)
        if pubdate is not None:
            pubdate = '%s-%s-%s' % (pubdate.group(3), pubdate.group(2),
                                    pubdate.group(1))
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        out.write('\n'.join(paras) + '\n')
Example #17
def _crawl_telegraaf_nl(crawler, out):
    sitemap = crawler.fetch_sitemap(
        'http://www.telegraaf.nl/sitemap.xml',
        subsitemap_filter=_should_fetch_telegraaf_sitemap)
    for url in sorted(sitemap):
        doc = crawler.fetch(urlencode(url))
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        title = re.search(
            r'<meta [a-zA-Z\-="]* property="og:title" content="(.+?)"', html)
        title = title.group(1) if title else ''
        pubdate = re.search(r'"publishDate":"([^"]+)"', html)
        pubdate = pubdate.group(1) if pubdate else None
        text = extract(
            'data-element="ArticlePage-intro">',
            '<div class="flex" data-element="ArticlePage-socialShare-root">',
            html) or ''
        paras = clean_paragraphs(title + '<br/>' + text)
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')
Example #18
def crawl_azg_am(crawler, out):
    urls = set()
    for d in daterange(date(2001, 1, 9), date.today()):
        datestr = '%04d%02d%02d00' % (d.year, d.month, d.day)
        url = 'http://www.azg.am/AM/%s' % datestr
        doc = crawler.fetch(url)
        assert doc.status == 200, (doc.status, url)
        content = doc.content.decode('utf-8')
        articles = [
            a for a in re.findall(r'20\d{8}', content) if not a.endswith('00')
        ]
        for a in articles:
            urls.add('http://www.azg.am/wap/?nl=AM&id=%s&Base_PUB=0' % a)
        print(len(urls))
    for url in sorted(urls):
        pubdate = re.search(r'id=(20\d{6})', url).group(1)
        doc = crawler.fetch(url)
        assert doc.status == 200, (doc.status, url)
        content = doc.content.decode('utf-8')
        text = extract('<hr>', '<hr>', content) or ''
        text = text.replace('\n', ' ')
        text = re.sub(r'</(p|h[1-9]|div)>', '\n', text)
        paras = list(filter(None, [cleantext(p) for p in text.splitlines()]))
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            out.write('# Publication-Date: %s-%s-%s\n' %
                      (pubdate[:4], pubdate[4:6], pubdate[6:8]))
            out.write('\n'.join(paras) + '\n')
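The date iteration above relies on an assumed `daterange` helper. A minimal sketch, assuming it yields every date from the start date up to (but not including) the end date, could be:

from datetime import timedelta


def daterange(start, end):
    # Hypothetical sketch: yield each calendar date from `start` up to,
    # but not including, `end`.
    current = start
    while current < end:
        yield current
        current += timedelta(days=1)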
Example #19
def _crawl_jornalet_com(crawler, out):
    for url in sorted(_find_urls_jornalet_com(crawler)):
        try:
            html = crawler.fetch_content(url)
        except UnicodeDecodeError:
            continue
        title = re.search(r'<meta property="og:title" content="([^"]+)"', html)
        title = title.group(1) if title else ''
        subtitle = extract('<h4 class="subtitol">', '</h4>', html) or ''
        content = extract('<p class="contingut">', '<hr', html) or ''
        paras = clean_paragraphs('\n'.join(
            ['<p>%s</p>' % p for p in (title, subtitle, content) if p]))
        paras = [p for p in paras if p.find('Abonar los amics de Jornalet') < 0]
        if not paras:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        out.write('\n'.join(paras) + '\n')
Example #20
def crawl_coislife_ie(crawler, out):
    links = set()
    for num in range(1, 12):
        if num > 1:
            listurl = 'https://www.coislife.ie/product-category/ga/page/%s/' % num
        else:
            listurl = 'https://www.coislife.ie/product-category/ga/'
        idxres = crawler.fetch(listurl)
        if idxres.status != 200:
            continue
        idxhtml = idxres.content.decode('utf-8')
        index = extract('<div class="products-archive--products">',
                        '<nav class="woocommerce-pagination">', idxhtml)
        for link in re.findall(
                r'<a href="(https://www.coislife.ie/product/[^"]+?)">', index):
            links.add(link)
    for url in sorted(links):
        fetchresult = crawler.fetch(url)
        if fetchresult.status != 200:
            continue
        html = fetchresult.content.decode('utf-8')
        title = re.search(r'<title>(.+?)</title>', html)
        title = title.group(1).split('&#8211;')[0].strip() if title else ''
        desc = re.search(r'<meta property="og:description" content="([^"]+?)"',
                         html)
        desc = cleantext(desc.group(1)) if desc else ''
        body = extract(
            '<div class="tab-content">',
            '<div class="entry-content in fade tab-pane" id="tab-additional_information">',
            html) or ''
        paras = clean_paragraphs(title + '<br/>' + body)
        pubdate = fetchresult.headers.get('Last-Modified')
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: Commerce\n')
            if desc:
                out.write('# Description: %s\n' % desc)
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            for para in paras:
                if para.find('Léigh sliocht as an leabhar') >= 0:
                    continue
                else:
                    out.write(para + '\n')
Example #21
def crawl_jagbani_punjabkesari_in(crawler, out):
    urls = set()
    main = crawler.fetch('http://jagbani.punjabkesari.in/')
    assert main.status == 200, main.status
    menu = extract('<nav id="menu" class="menu">', '</nav>',
                   main.content.decode('utf-8'))
    urls_re = re.compile(r'href="(https?://jagbani\.punjabkesari\.in/[^"]+?)"')
    category_urls = urls_re.findall(menu)
    for category_url in sorted(set([x.strip() for x in category_urls])):
        for page in range(1, 1000):
            doc = crawler.fetch(category_url + '/page/%d' % page)
            content = doc.content.decode('utf-8') if doc.status == 200 else ''
            if content.find('class="story"') < 0:
                break
            for u in urls_re.findall(
                    extract('<span class="story">', '<div class="kjpage"',
                            content)):
                urls.add(urlencode(u.strip()))
    for url in sorted(urls):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        try:
            content = doc.content.decode('utf-8')
        except UnicodeDecodeError:
            continue
        title = extract('<title>', '</title>', content)
        text = extract('<article>', '</article>', content)
        if not text:
            continue
        text = re.sub(r'<br[^a-zA-Z][^>]*>', '<br>', text)
        text = text.replace('\n', ' ').replace('<br>', '\n')
        paras = [title or ''] + text.splitlines()
        paras = list(filter(None, [cleantext(p) for p in paras]))
        pubdate = re.search(
            '<meta property="article:published_time" content="([^"]+?)"',
            content)
        pubdate = pubdate.group(1) if pubdate else None
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')
Example #22
def crawl_peig_ie(crawler, out):
    crawler.set_context(ssl.SSLContext(ssl.PROTOCOL_TLSv1_2))
    sitemap = crawler.fetch_sitemap('https://peig.ie/sitemap_index.xml',
                                    subsitemap_filter=_peig_filter_robots)

    def peig_cat(page):
        if page.find('/imeachtai/') >= 0:
            return 'Events'
        elif page.find('peig.ie/20') >= 0:
            return 'News'
        elif page.find('/fol%C3%BAntais/') >= 0:
            return 'Job listings'
        else:
            return ''

    # Peig.ie has a lot of posts from other sites
    def skip_page(site):
        if site.find('//nos.ie/') >= 0:
            return True
        elif site.find('//tuairisc.ie/') >= 0:
            return True
        elif site.find('//meoneile.ie/') >= 0:
            return True
        else:
            return False

    for url in sorted(sitemap.keys()):
        if url == 'https://peig.ie/':
            continue
        fetchresult = crawler.fetch(url)
        if fetchresult.status != 200:
            continue
        html = fetchresult.content.decode('utf-8')
        title = re.search(r'<title>(.+?)</title>', html)
        title = title.group(1).split('|')[0].strip() if title else ''
        read_more = re.search(r'<a[^>]*href="([^"]+)"[^>]*>Níos mó</a>', html)
        if read_more and skip_page(read_more.group(1)):
            continue
        if '<meta property="article:modified_time"' in html:
            date = re.search(
                r'<meta property="article:modified_time" content="([^"]+)"',
                html).group(1)
        else:
            date = re.search(r'"datePublished":"([^"]+)"', html).group(1)
        body = extract('<div class="uk-margin-medium-top" property="text">',
                       '<ul class="uk-pagination', html) or ''
        paras = clean_paragraphs(title + '<br/>' + body)
        genre = peig_cat(url)
        if paras:
            out.write('# Location: %s\n' % url)
            if genre:
                out.write('# Genre: %s\n' % genre)
            if date:
                out.write('# Publication-Date: %s\n' % date)
            out.write('\n'.join(paras) + '\n')
    crawler.set_context(ssl.SSLContext(ssl.PROTOCOL_TLSv1))
Example #23
def crawl_wikisource_trieste_vernacola(crawler):
    out = crawler.get_output(language='vec-u-sd-itts')
    urls = set()
    index = crawler.fetch(
        'https://vec.wikisource.org/wiki/Indice:Trieste_vernacola.djvu')
    assert index.status == 200, index.status
    remarks = extract('<div id="remarks">', 'Colombe</a>',
                      index.content.decode('utf-8'))
    for urlpath in sorted(set(re.findall(r'href="(/wiki/[^"]+)"', remarks))):
        if not urlpath.startswith('/wiki/Trieste_vernacola/'):
            urls.add('https://vec.wikisource.org' + urlpath)
    for url in sorted(urls.difference(BLACKLISTED_URLS)):
        doc = crawler.fetch(url)
        assert doc.status == 200, (doc.status, url)
        content = doc.content.decode('utf-8')
        text = extract('<div id="scatola" class="testo">', '<noscript>',
                       content)
        text = text.split('<dt>Note</dt>')[0].split('<dl>')[0]
        text = text.replace('\n', ' ')
        text = re.sub(r'<sup.+?</sup>', '', text)
        text = text.replace('&#160;', ' ')  # NBSP used for spacing
        text = text.replace("'", "’")
        text = re.sub(r'<!--.+?-->', '', text, flags=re.DOTALL)
        text = re.sub(r' alt="[^"]+"', ' ', text, flags=re.DOTALL)
        text = re.sub(r'<span class="numeroriga".+?</span>', '', text)
        text = re.sub(r'</(?:div|DIV|p|P|[hH][1-6]|table|TABLE)>', '\n', text)
        text = re.sub(r'<(?:br|BR)\s*/?>', '\n', text)
        lines = [l for l in text.splitlines()
                 if l.find('noprint') < 0 and l.find('font-size:smaller') < 0]
        text = '\n'.join([cleantext(l) for l in lines])
        text = re.sub('\n{2,}', '<p>', text).replace('\n', ' | ')
        text = text.replace('<p>', '\n')
        paras = list(
            filter(None, [' '.join(p.split()) for p in text.splitlines()]))
        if not paras:
            continue
        # The book, published in 1920, is a collection of earlier lyrics.
        pubyear = re.search(r'<span id="ws-year">(\d{4})</span>', content)
        pubyear = int(pubyear.group(1)) if pubyear else 1920
        out.write('# Location: %s\n' % url)
        out.write('# Genre: Lyrics\n')
        out.write('# Publication-Date: %d\n' % pubyear)
        out.write('\n'.join(paras) + '\n')
Example #24
def _crawl_asymptotejournal_com(crawler, out):
    url = ('https://www.asymptotejournal.com/nonfiction/'
           'shibu-tudu-memories-of-the-kirta-dangra/santhali/')
    html = crawler.fetch_content(url)
    content = extract('<!-- article content -->',
                      '<img src="/images/end-logo-black.gif"', html)
    out.write('# Location: %s\n' % url)
    out.write('# Genre: Fiction\n')
    paras = clean_paragraphs(content)
    paras = [p for p in paras if p[0] not in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ']
    out.write('\n'.join(paras) + '\n')
Example #25
def crawl_forasnagaeilge_ie(crawler, out):
    sitemap = crawler.fetch_sitemap(
        'https://www.forasnagaeilge.ie/sitemap_index.xml')
    pubdate_regex = re.compile(r'"datePublished":"([^"]+)",')
    for url in sorted(sitemap.keys()):
        orig_url = url
        if '?lang=en' in url:
            ga_url = url.replace('?lang=en', '')
            if ga_url in sitemap.keys():
                continue
        if '/blog-en/' in url:
            continue
        if '/corporate-information/' in url:
            continue
        if '/torthai-cuardaigh/' in url:
            continue
        fetchresult = crawler.fetch(url)
        if fetchresult.status != 200:
            continue
        html = fetchresult.content.decode('utf-8')
        if '<html class="no-js" lang="en">' in html:
            continue
        title = extract('<title>', ' - www.forasnagaeilge.ie</title>',
                        html) or ''
        pubdate_match = pubdate_regex.search(html)
        if pubdate_match:
            pubdate = pubdate_match.group(1)
        else:
            pubdate = sitemap.get(url) or sitemap[orig_url]
        body = extract('<div id="main" class="container">',
                       '</div><!-- /.content -->', html)
        if not body:
            continue
        paras = clean_paragraphs(body)
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            out.write('# Title: %s\n' % title)
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')
Example #26
def _crawl_raestdzinad_ru(crawler, out):
    urls = crawler.fetch_sitemap(
        urlencode('https://растдзинад.рф/sitemap_index.xml'))
    for url in sorted(urls):
        if re.search(r'/20\d{2}/', url) is None:
            continue
        html = crawler.fetch_content(url)
        title = extract('<h1 class="entry-title">', '</h1>', html) or ''
        text = extract('<div class="td-post-content">', '<footer>', html) or ''
        text = text.split('<div class = "evc-social-likes"')[0]
        pubdate = re.search(
            r'<meta property="article:published_time" content="([^"]+)"', html)
        if pubdate:
            pubdate = pubdate.group(1)
        paras = clean_paragraphs('%s<p/>%s' % (title, text))
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')
Example #27
def _crawl_eestikirik_ee(crawler, out):
    for url in sorted(_find_urls_eestikirik_ee(crawler)):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        title = extract('<h1 class="entry_title">', '</h1>', html) or ''
        entry = extract('<div class="entry">', '<div style="min-height:33px;"',
                        html) or ''
        pubdate = re.search(
            r'(\d{1,2})\.(\d{1,2})\.(20\d{2})',
            extract('<div id="content">', '</small>', html) or '')
        if pubdate is not None:
            pubdate = '%04d-%02d-%02d' % (int(pubdate.group(3)),
                                          int(pubdate.group(2)),
                                          int(pubdate.group(1)))
        paras = clean_paragraphs('%s<br/>%s' % (title, entry))
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')
Example #28
def _crawl_val_levante_emv_com(crawler, out):
    urls = set()
    for url in crawler.fetch_sitemap('http://val.levante-emv.com/sitemap.xml'):
        url = url.replace('//www.levante-emv.com', '//val.levante-emv.com')
        if re.search(r'/\d{4}/\d{2}/\d{2}/', url) is not None:
            urls.add(url)
    for url in sorted(urls):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        es_url = url.replace('//val.levante-emv.com', '//www.levante-emv.com')
        html = doc.content.decode('utf-8')
        pubdate = re.search(
            r'<meta name="cXenseParse:recs:publishtime" content="([^"]+)"',
            html)
        pubdate = pubdate.group(1) if pubdate else None
        title = extract('<span itemprop="articleBody">', '</h1>', html)
        subtitle = extract('<h2 itemprop="description">', '</h2>', html)
        content = extract('<span itemprop="articleBody">',
                          '</apertium-notrans>', html)
        paras = clean_paragraphs(''.join(
            ['<p>%s</p>' % p for p in (title, subtitle, content) if p]))
        text = '\n'.join(paras)
        for sep in ['Compartir en Twitter', 'HEMEROTECA\n', '\nPublicitat\n']:
            text = text.split(sep)[0].strip()
        if not text:
            continue
        if any(b in text for b in [
                'inicia sessió si eres subscriptor',
                'Si eres subscriptor inicia sessió',
                'Para continuar leyendo... suscríbete'
        ]):
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Translation.es: %s\n' % es_url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        out.write(text + '\n')
Example #29
def _crawl_than_lwin_times(crawler, out):
    urls = find_wordpress_urls(crawler, 'http://thanlwintimes.com/')
    for url in sorted(urls):
        if not url.endswith('/'):
            continue
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        pubdate = re.search(
            r'<time class="entry-date updated td-module-date" '
            r'datetime="([^"]+)"', html)
        pubdate = pubdate.group(1) if pubdate else ''
        title = (extract('<title>', '</title>', html) or '').split('|')[0]
        body = extract('<div class="td-post-content">',
                       "<div class='sfsi_Sicons'", html) or ''
        body = body.split('Please follow and like us')[0]
        paragraphs = clean_paragraphs('%s<br/>%s' % (title, body))
        if len(paragraphs) > 0:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paragraphs) + '\n')
Example #30
def _crawl_svaboda_org(crawler, out):
    sitemap = crawler.fetch_sitemap('https://www.svaboda.org/sitemap.xml')
    for url in sorted(sitemap):
        if (url == 'https://www.svaboda.org/' or
                url.startswith('https://www.svaboda.org/z/')):  # index pages
            continue
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        title = extract('<title>', '</title>', html) or ''
        pubdate = re.search(
            r'<div class="published">\s*<span class="date"\s*>'
            r'\s*<time datetime="([^"]+)"', html)
        pubdate = pubdate.group(1) if pubdate else None
        body = extract('<div class="body-container">', '<div id="comments"',
                       html) or ''
        paras = clean_paragraphs('%s<p/>%s' % (title, body))
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')