Example #1
import requests
from bs4 import BeautifulSoup

import processor  # project-local helper module (not shown on this page)


def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('Keskisuomalainen', url,
                                           r.status_code, [u''], [u''], u'',
                                           u'', u'', u'', [u''], [u''])

    # Force UTF-8 instead of the encoding requests guesses from the headers.
    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    # The article content sits inside the role="main" landmark.
    article = soup.find(role='main')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    # Drop inline scripts so they do not leak into the collected text.
    processor.decompose_all(article.find_all('script'))

    categories = processor.collect_categories(
        article.find_all(class_='article__section'))
    datetime_list = processor.collect_datetime(
        article.find(class_='article__published'))
    author = processor.collect_text(article.find(class_='article__author'))
    title = processor.collect_text(article.find(class_='article__title'))
    ingress = processor.collect_text(article.find(class_='article__summary'))
    text = processor.collect_text(article.find(class_='article__body'))
    images = processor.collect_images_by_parent(
        article.find_all(class_='article__images'), '')
    captions = processor.collect_image_captions(
        article.find_all(itemprop='caption description'))

    return processor.create_dictionary('Keskisuomalainen', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
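
Every example on this page reports its result through processor.create_dictionary, always with the same eleven positional arguments: source name, URL, HTTP status, and the eight collected fields. The helper itself is not shown here; the sketch below is inferred from those call sites, and the dictionary keys are assumptions for illustration only:

# Sketch only: inferred from the call sites above; the key names are
# assumptions, not the project's actual schema.
def create_dictionary(domain, url, http_status, categories, datetime_list,
                      author, title, ingress, text, images, captions):
    return {
        'domain': domain,
        'url': url,
        'http': http_status,
        'categories': categories,
        'datetime_list': datetime_list,
        'author': author,
        'title': title,
        'ingress': ingress,
        'text': text,
        'images': images,
        'captions': captions,
    }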
Example #2
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='content__wrapper')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))

    categories = processor.collect_categories(
        article.find_all(class_='typography__category'))
    datetime_list = processor.collect_datetime(
        article.find(class_='meta-content'))
    author = processor.collect_text(article.find(class_='typography__author'))
    title = processor.collect_text(article.find(class_='content__title'))
    ingress = processor.collect_text(article.find(class_='content__intro'))
    text = processor.collect_text(article.find(class_='content__body'))
    images = processor.collect_images_by_parent(
        article.find_all(class_='content__main-gallery'), '')

    # Captions are escaped HTML fragments in the gallery links'
    # data-caption attributes, so each one needs a second parsing pass.
    captions = []
    for caption_div in article.find_all(class_='content__main-gallery'):
        caption = BeautifulSoup(caption_div.find('a')['data-caption'],
                                "html.parser")
        captions.append(processor.collect_text(caption))

    return processor.create_dictionary('Seura', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images,
                                       captions)
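
Unlike the other examples, the Seura parser cannot use processor.collect_image_captions: each caption travels as an escaped HTML fragment inside a gallery link's data-caption attribute, so the loop above parses every fragment with a second BeautifulSoup pass. A self-contained illustration with made-up markup:

from bs4 import BeautifulSoup

# Hypothetical gallery link; the caption is an escaped HTML fragment
# stored in the data-caption attribute.
html = '<a data-caption="&lt;p&gt;Kuva: Seura&lt;/p&gt;" href="#">kuva</a>'
link = BeautifulSoup(html, 'html.parser').find('a')

# The first parse already decoded the entities, so the attribute value is
# itself markup and needs its own parsing pass before text extraction.
caption = BeautifulSoup(link['data-caption'], 'html.parser')
print(caption.get_text())  # -> Kuva: Seura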
Example #3
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))

    departments = article.find(class_='field-name-field-department-tref')
    if departments is not None:
        categories = processor.collect_categories(departments.find_all('a'))
    else:
        # Guard against a missing department block instead of raising
        # AttributeError; reuse the error-branch placeholder.
        categories = [u'']

    datetime_list = processor.collect_datetime(
        article.find(class_='field-name-post-date'))

    author = article.find(class_='author')
    if author is not None:
        # Strip the avatar image so only the byline text is collected.
        processor.decompose(author.find(class_='img'))
        author = processor.collect_text(author.find('h3'))
    else:
        author = u''

    title = processor.collect_text(article.find('h1'))
    text = processor.collect_text(article.find(class_='field field-name-body'))
    images = processor.collect_images_by_parent(
        article.find_all(class_='img'), '')
    captions = processor.collect_image_captions(
        article.find_all(class_='caption'))

    return processor.create_dictionary('Helsingin uutiset', url,
                                       r.status_code, categories,
                                       datetime_list, author, title, u'',
                                       text, images, captions)
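
This parser hands processor.decompose a raw find() result, which may be None, and example #5 below does the same; both helpers therefore look like thin, None-tolerant wrappers over bs4's Tag.decompose(). A minimal sketch of that assumed behaviour (not the project's actual code):

# Sketch only: Tag.decompose() removes a tag and its subtree from the
# soup. decompose() must tolerate None because call sites pass raw
# find() results; find_all() returns a list, so no guard is needed there.
def decompose(element):
    if element is not None:
        element.decompose()

def decompose_all(elements):
    for element in elements:
        element.decompose()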
Example #4
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(id='sp-component')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))

    meta = article.find(class_='category_date')
    if meta is not None:
        categories = processor.collect_categories(meta.find_all('a'))
        datetime_list = processor.collect_datetime(meta)
    else:
        # Guard against a missing meta block instead of raising
        # AttributeError; reuse the error-branch placeholders.
        categories = [u'']
        datetime_list = [u'']
    author = processor.collect_text(article.find(class_='author_credits'))
    title = processor.collect_text(article.find('h1'))
    text = processor.collect_text(article.find(class_='itemIntroText'))
    images = processor.collect_images_by_parent(
        article.find_all(class_='itemImage'), 'https://www.karjalainen.fi')
    captions = processor.collect_image_captions(
        article.find_all(class_='itemImageCaption'))

    return processor.create_dictionary('Karjalainen', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, captions)
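
The second argument of processor.collect_images_by_parent varies by site: '' in examples #1 to #3, the site root 'https://www.karjalainen.fi' here, and the bare scheme 'http:' in example #5. That pattern suggests it is a prefix glued onto relative or protocol-relative src values. A plausible sketch under that assumption, with the [u''] fallback mirroring the placeholder the error branches use:

# Sketch only: assumed behaviour of the helper, inferred from the
# prefixes the call sites pass ('', a site root, and a bare scheme).
def collect_images_by_parent(parents, prefix=''):
    urls = []
    for parent in parents:
        for img in parent.find_all('img'):
            src = img.get('src')
            if src:
                urls.append(prefix + src)
    return urls if urls else [u'']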
Example #5
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='content')
    article = soup.find(class_='content')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose_all(article.find_all(class_='ad'))
    processor.decompose_all(article.find_all(class_='ad-container'))
    processor.decompose_all(article.find_all('style'))
    processor.decompose(article.find(id='fullWidthBottom'))

    categories = processor.collect_categories(
        article.find_all(class_='article-category'))
    datetime_list = processor.collect_datetime_objects(
        article.find_all('time'), 'datetime')
    author = processor.collect_text(article.find(class_='author-name'))
    title = processor.collect_text(article.find(class_='article-title'))
    ingress = processor.collect_text(article.find(class_='lead-paragraph'))
    text = processor.collect_text(article.find(class_='editorial'))
    images = processor.collect_images_by_parent(
        article.find_all(class_='img-container'), 'http:')
    captions = processor.collect_image_captions(
        article.find_all(class_='figcaption'))

    return processor.create_dictionary('Mtv', url, r.status_code, categories,
                                       datetime_list, author, title, ingress,
                                       text, images, captions)
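
Because all five parsers share one signature, a driver only has to dispatch by site. A hypothetical usage, with a made-up module layout and URL, and dictionary keys as assumed in the sketch after example #1:

# Hypothetical driver; the module name, URL and keys are illustrative only.
from parsers import mtv

article = mtv.parse('https://www.mtv.fi/uutiset/example')
if article['http'] == 404:
    print('missing page:', article['url'])
else:
    print(article['title'], '|', ', '.join(article['categories']))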