Exemplo n.º 1
0
def html2article(html, url, selector=False, merge=False, **options):
    """Extract an article dict from *html* fetched at *url*.

    When *selector* is true, the extractor's selector info is merged into
    the article dict.  When *merge* is true and the article lists extra
    pages, those pages are fetched and merged into ``article['content']``.
    Returns ``None`` when no article could be extracted.
    """
    extractor = ArticleExtractor(html, url, **options)
    result = extractor.article
    if result is None:
        return None
    if selector:
        result.update(extractor.selector)
    if merge and result['pages']:
        merged = ArticleMerger(
            url,
            extractor.title,
            fetch_urls(extractor.pages, handle=get_or_cache),
            **extractor.selector)
        result['content'] = merged.content
    return result
Exemplo n.º 2
0
def html2article(html, url, selector=False, merge=False, **options):
	"""Build an article dict from raw *html* located at *url*.

	*selector* merges the extractor's selector info into the result;
	*merge* fetches and merges multi-page articles into one content body.
	Returns ``None`` when extraction fails.
	"""
	extractor = ArticleExtractor(html, url, **options)
	article = extractor.article
	if article is not None:
		if selector:
			article.update(extractor.selector)
		if article['pages'] and merge:
			pages_html = fetch_urls(extractor.pages, handle=get_or_cache)
			article['content'] = ArticleMerger(
				url, extractor.title, pages_html,
				**extractor.selector).content
	return article
Exemplo n.º 3
0
async def echo(event):
    """Reply to *event* with a screenshot for every URL in its text.

    For each URL yielded by ``fetch_urls(event.text)`` the shared browser
    ``page`` navigates there, captures a PNG screenshot to a temp file,
    replies with the original text plus the image, then deletes the file.
    On any failure the original text is echoed back followed by a
    truncated error message, and the exception is logged.

    NOTE(review): ``fetch_urls`` here is assumed to extract URLs from the
    message text (no ``handle=`` kwarg) — confirm against its definition.
    """
    try:
        urls = fetch_urls(event.text)
        for url in urls:
            logging.info(url)
            await page.goto(url)

            file_name = f'{time.time()}.png'
            await page.screenshot(path=file_name, fullPage=False)
            try:
                await event.reply(event.text, file=file_name)
            finally:
                # Remove the temp screenshot even when the reply raises,
                # so failed replies no longer leak files on disk.
                os.remove(file_name)

    except Exception as err:
        await event.reply(event.text)
        # Truncate the error text before sending it back.
        await event.respond(str(err)[:2000])
        logging.exception(err)
        return
Exemplo n.º 4
0
def test_article():
    """HTTP test view: extract an article from the ``url`` query arg.

    Reads ``url`` and ``debug`` from the request query string, records the
    URL as a test case, extracts the article (merging extra pages when
    present), and returns a JSON string with url, article, and selector.
    Non-``http://`` URLs are rejected with a plain error string.
    """
    # A direct comparison already yields a bool; the former
    # ``True if ... else False`` wrapper was redundant.
    debug = request.args.get('debug') == 'true'
    url = request.args.get('url', '')
    if not url.startswith('http://'):
        return 'url is not startswith http://'
    add_test_url(url)
    html = get_or_cache(url, print_path=True)
    extractor = ArticleExtractor(html, url, debug=debug)
    article = extractor.article
    selector = extractor.selector
    if extractor.pages and article:
        article['content'] = ArticleMerger(url,
                                           extractor.title,
                                           fetch_urls(extractor.pages,
                                                      handle=get_or_cache),
                                           debug=debug,
                                           **selector).content
    return json.dumps({'url': url, 'article': article, 'selector': selector})
Exemplo n.º 5
0
def test_article():
	"""HTTP test view: extract an article from the ``url`` query arg.

	Records the URL as a test case, extracts the article (merging extra
	pages when present), and returns a JSON string containing the url,
	the article dict, and the selector used.  Non-``http://`` URLs are
	rejected with a plain error string.
	"""
	# A direct comparison already yields a bool; the former
	# ``True if ... else False`` wrapper was redundant.
	debug = request.args.get('debug') == 'true'
	url = request.args.get('url', '')
	if not url.startswith('http://'):
		return 'url is not startswith http://'
	add_test_url(url)
	html = get_or_cache(url, print_path=True)
	extractor = ArticleExtractor(html, url, debug=debug)
	article = extractor.article
	selector = extractor.selector
	if extractor.pages and article:
		article['content'] = ArticleMerger(
			url,
			extractor.title, 
			fetch_urls(extractor.pages, handle=get_or_cache),
			debug=debug,
			**selector
		).content
	return json.dumps({'url':url, 'article':article, 'selector':selector})
Exemplo n.º 6
0
def test_segment(url):
	"""Segment the article found at *url* and return a JSON report.

	Strips any fragment/query from *url*, fetches and extracts the
	article (merging extra pages when present), then runs the segmenter
	over title + text.  Non-``http://`` URLs get a plain error string.
	"""
	# Drop fragment and query string before fetching.
	url = url.partition('#')[0].partition('?')[0]
	if not url.startswith('http://'):
		return 'url is not startswith http://'
	add_test_url(url)
	html = get_or_cache(url)
	extractor = ArticleExtractor(html, url)
	content = extractor.content
	if extractor.pages:
		merger = ArticleMerger(
			url,
			extractor.title,
			fetch_urls(extractor.pages, handle=get_or_cache),
			**extractor.selector)
		content = merger.content
	words = segmentor.seg(extractor.title, html2text(content))
	return json.dumps({'url': url, 'title': extractor.title, 'words': words})
Exemplo n.º 7
0
def test_segment_all():
	"""Segment every recorded test URL and return a JSON list of reports.

	For each stored test URL: fetch the cached page, extract the article
	(merging extra pages when present), segment title + text, and collect
	url/title/words dicts into one JSON array.
	"""
	results = []
	for url in get_test_urls():
		html = get_or_cache(url)
		extractor = ArticleExtractor(html, url)
		content = extractor.content
		if extractor.pages:
			content = ArticleMerger(
				url,
				extractor.title,
				fetch_urls(extractor.pages, handle=get_or_cache),
				**extractor.selector).content
		words = segmentor.seg(extractor.title, html2text(content, code=False))
		results.append({'url': url, 'title': extractor.title, 'words': words})
	return json.dumps(results)
Exemplo n.º 8
0
def test_segment(url):
    """Segment the article found at *url* and return a JSON report.

    Fragment and query components are stripped from *url* first.  The
    page is fetched (via cache), the article extracted and, when it has
    extra pages, merged; the segmenter then runs over title + text.
    Non-``http://`` URLs get a plain error string.
    """
    # Strip '#fragment' and '?query' suffixes before fetching.
    url = url.partition('#')[0].partition('?')[0]
    if not url.startswith('http://'):
        return 'url is not startswith http://'
    add_test_url(url)
    html = get_or_cache(url)
    extractor = ArticleExtractor(html, url)
    content = extractor.content
    if extractor.pages:
        pages_html = fetch_urls(extractor.pages, handle=get_or_cache)
        content = ArticleMerger(url, extractor.title, pages_html,
                                **extractor.selector).content
    report = {
        'url': url,
        'title': extractor.title,
        'words': segmentor.seg(extractor.title, html2text(content)),
    }
    return json.dumps(report)
Exemplo n.º 9
0
def test_segment_all():
    """Segment every recorded test URL and return a JSON list of reports.

    Each report contains the url, the extracted title, and the word
    segments produced from title + text (code blocks excluded).
    """
    reports = []
    for url in get_test_urls():
        page_html = get_or_cache(url)
        extractor = ArticleExtractor(page_html, url)
        content = extractor.content
        if extractor.pages:
            extra = fetch_urls(extractor.pages, handle=get_or_cache)
            content = ArticleMerger(url, extractor.title, extra,
                                    **extractor.selector).content
        reports.append({
            'url': url,
            'title': extractor.title,
            'words': segmentor.seg(extractor.title,
                                   html2text(content, code=False)),
        })
    return json.dumps(reports)