示例#1
0
def parse(url):
    """Parse a Taloussanomat article at *url* into the shared result dict.

    Returns the dict built by ``processor.create_dictionary``; on a 404
    response or a missing article container a placeholder dict with empty
    fields is returned instead.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='single-article')
    # PEP 8: compare to None with `is`, not `==`.
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    # Remove scripts, related-link boxes and ads so they don't leak into text.
    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='related-links'))
    processor.decompose_all(article.find_all(class_='article-ad-block'))

    categories = processor.collect_categories(
        soup.find(class_='section-title'))
    datetime_list = processor.collect_datetime(article.find('time'))
    author = processor.collect_text(article.find(class_='byline'))
    title = processor.collect_text(article.find(class_='article-title'))
    ingress = processor.collect_text(article.find(class_='ingress'))
    text = processor.collect_text(article.find(class_='body'))
    images = processor.collect_images(article.find_all('img'), 'src', 'http:')
    captions = processor.collect_image_captions(article.find_all('figcaption'))

    return processor.create_dictionary('Taloussanomat', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
示例#2
0
def parse(url):
    """Parse a Tiedonantaja article at *url* into the shared result dict.

    Returns a placeholder dict with empty fields on a 404 response or when
    the ``<main>`` element is missing from the page.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('main')
    # PEP 8: compare to None with `is`, not `==`.
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    # Drop scripts and "nosto" (promo lift) boxes before text extraction.
    processor.decompose_all(article.find_all('script'))
    processor.decompose_all(article.find_all(class_='nosto'))

    # NOTE(review): `links` is assumed to always exist on article pages;
    # a missing .links div would raise AttributeError here — confirm.
    links = article.find(class_='links')
    categories = processor.collect_categories(links.find_all('li'))

    datetime_list = processor.collect_datetime(
        article.find(class_='field-name-field-publish-date'))
    author = processor.collect_text(article.find(class_='tekija'))
    title = processor.collect_text(article.find(id='page-title'))
    text = processor.collect_text(article.find(class_='body'))
    images = processor.collect_images(
        article.find(class_='views-field-field-op-main-image').find_all('img'),
        'src', '')

    return processor.create_dictionary('Tiedonantaja', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, [u''])
示例#3
0
def parse(url):
    """Parse a Yle X article via the ylex JSON API into the shared dict.

    The article id is taken from the last path segment of *url* and fetched
    from the API; a placeholder dict is returned on a 404 response.
    """
    api_path = 'http://yle.fi/ylex/api/article/'
    _id = url.split('/')[-1]

    r = requests.get(api_path + _id)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'

    # Renamed from `json` to avoid shadowing the stdlib `json` module name.
    data = r.json()

    categories = [processor.process(data['homesection']['name'])]
    datetime_list = processor.collect_datetime_json(data, 'datePublished',
                                                    'dateModified')
    # NOTE(review): assumes at least one author entry is always present.
    author = processor.process(data['authors'][0]['name'])
    title = processor.process(data['title'])
    ingress = processor.process(data['lead'])

    # The body arrives as an HTML fragment; strip markup via BeautifulSoup.
    text_html = BeautifulSoup(data['html'], "html.parser")
    text = processor.collect_text(text_html)

    if 'image' in data:
        image_json = data['image']
        images = [image_json['uri']]
        captions = [image_json['alt']]
    else:
        images, captions = [u''], [u'']

    return processor.create_dictionary('Yle X', url, r.status_code, categories,
                                       datetime_list, author, title, ingress,
                                       text, images, captions)
示例#4
0
def parse(url):
    """Parse a Seura article at *url* into the shared result dict.

    Returns a placeholder dict with empty fields on a 404 response or when
    the content wrapper is missing.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='content__wrapper')
    # PEP 8: compare to None with `is`, not `==`.
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))

    categories = processor.collect_categories(article.find_all(class_='typography__category'))
    datetime_list = processor.collect_datetime(article.find(class_='meta-content'))
    author = processor.collect_text(article.find(class_='typography__author'))
    title = processor.collect_text(article.find(class_='content__title'))
    ingress = processor.collect_text(article.find(class_='content__intro'))
    text = processor.collect_text(article.find(class_='content__body'))
    images = processor.collect_images_by_parent(article.find_all(class_='content__main-gallery'), '')

    # Captions live in the gallery links' data-caption attribute as HTML
    # fragments; parse each fragment and collect its plain text.
    # (Replaces the old [None]-seed + pop(0) list-building trick.)
    captions = [
        processor.collect_text(
            BeautifulSoup(caption_div.find('a')['data-caption'], "html.parser"))
        for caption_div in article.find_all(class_='content__main-gallery')
    ]

    return processor.create_dictionary('Seura', url, r.status_code, categories, datetime_list, author, title, ingress, text, images, captions)
示例#5
0
def parse_from_archive(url, content):
    """Parse an archived Kainuun sanomat article from raw HTML *content*.

    *url* is only recorded in the result; no network access is performed.
    """
    article = BeautifulSoup(content, "html.parser")

    # PEP 8: compare to None with `is`, not `==`.  (BeautifulSoup() itself
    # never returns None; kept as a defensive guard.)
    if article is None:
        return processor.create_dictionary('Kainuun sanomat', url, 404, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    meta = article.find(class_='hakutuloslahde')

    datetime_list = processor.collect_datetime(meta)

    # The meta line is "source, category, ..."; the category is field #2.
    categories = [processor.collect_text(meta).split(',')[1].strip()]

    author = processor.collect_text(article.find(class_='signeeraus'))

    title = processor.collect_text(article.find(class_='otsikko'))

    # Join all body divs with single spaces (linear-time, vs. repeated +=).
    text_divs = article.find_all(class_='artikkelip')
    text = ' '.join(
        processor.collect_text(text_content) for text_content in text_divs
    ).strip()

    captions = processor.collect_image_captions(
        article.find_all(class_='kuva'))

    return processor.create_dictionary('Kainuun sanomat', url, 200, categories,
                                       datetime_list, author, title, u'', text,
                                       [u''], captions)
示例#6
0
def parse(url):
    """Parse an Uusi Suomi article at *url* into the shared result dict.

    Returns a placeholder dict with empty fields on a 404 response or when
    the content region is missing.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='region-content-inner')
    # PEP 8: compare to None with `is`, not `==`.
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    # Strip scripts, comments and author decoration before extraction.
    processor.decompose_all(article.find_all('script'))
    processor.decompose_all(article.find_all('noscript'))
    processor.decompose(article.find(id='comments'))
    processor.decompose(article.find(class_='contributor'))
    processor.decompose(article.find(class_='field-name-field-author-image'))

    categories = processor.collect_categories(article.find_all(class_='field-name-field-category'))
    datetime_list = processor.collect_datetime_objects(article.find_all(class_='date-display-single'), 'content')
    author = processor.collect_text(article.find(class_='author-name'))
    title = processor.collect_text(article.find(id='page-title'))
    text = processor.collect_text(article.find(class_='field-name-body'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(article.find_all(class_='field-name-field-image-description'))

    return processor.create_dictionary('Uusi Suomi', url, r.status_code, categories, datetime_list, author, title, u'', text, images, captions)
示例#7
0
def parse(url):
    """Parse an Iltalehti article at *url* into the shared result dict.

    Returns a placeholder dict with empty fields on a 404 response or when
    the central container is missing.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    # Iltalehti serves Latin-1, not UTF-8.
    r.encoding = 'iso-8859-1'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(id='container_keski')
    # PEP 8: compare to None with `is`, not `==`.
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='kp-share-area'))

    categories = processor.collect_categories(soup.find_all(class_='sel'))
    datetime_list = processor.collect_datetime(article.find(class_='juttuaika'))

    # Drop the author's profile link so only the plain name remains.
    # NOTE(review): assumes an .author div is always present — confirm.
    author_div = article.find(class_='author')
    processor.decompose(author_div.find('a'))
    author = processor.collect_text(author_div, True)

    title = processor.collect_text(article.find('h1'))
    ingress = processor.collect_text(article.find(class_='ingressi'), True)
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(article.find_all(class_='kuvateksti'))

    # Remove image blocks before collecting body text from the isense tag.
    processor.decompose_all(article.find_all(class_='kuvamiddle'))

    text = processor.collect_text(article.find('isense'))

    return processor.create_dictionary('Iltalehti', url, r.status_code, categories, datetime_list, author, title, ingress, text, images, captions)
def parse(url):
    """Parse an Iltalehti blog post at *url* into the shared result dict.

    The publication date is not in the markup; it is recovered from the
    /YYYY/MM/DD/ segments of the URL itself.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    # PEP 8: compare to None with `is`, not `==`.
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))

    title = processor.collect_text(article.find(class_='entry-title'))

    # URL shape assumed: http(s)://host/blogit/<year>/<month>/<day>/...
    url_elements = url.split('/')
    year = url_elements[4]
    month = url_elements[5]
    day = url_elements[6]
    # Use the idiomatic datetime.strptime(...).date() instead of calling the
    # unbound method as datetime.date(datetime.strptime(...)).
    datetime_list = [
        datetime.strptime(day + '.' + month + '.' + year, "%d.%m.%Y").date()
    ]

    author = processor.collect_text(article.find(class_='author vcard'))
    text = processor.collect_text(article.find(class_='entry-content'))

    return processor.create_dictionary('Iltalehti Blogit', url, r.status_code, [u''], datetime_list, author, title, u'', text, [u''], [u''])
示例#9
0
def parse(url):
    """Parse a Faktabaari article at *url* into the shared result dict.

    Faktabaari pages carry no category or caption data, so those fields are
    returned empty.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(id='main-content')
    # PEP 8: compare to None with `is`, not `==`.
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='reviewpic'))

    datetime_list = processor.collect_datetime(
        article.find(class_='published'))
    author = processor.collect_text(article.find(class_='author'))
    title = processor.collect_text(article.find('h1'))
    text = processor.collect_text(article.find(class_='entry-content'))
    images = processor.collect_images(article.find_all('img'), 'src', '')

    return processor.create_dictionary('Faktabaari', url, r.status_code, [u''],
                                       datetime_list, author, title, u'', text,
                                       images, [u''])
示例#10
0
def parse(url):
    """Parse a Kd-lehti article at *url* into the shared result dict.

    Returns a placeholder dict with empty fields on a 404 response or when
    no <article> element is present.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    # PEP 8: compare to None with `is`, not `==`.
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))
    # NOTE(review): assumes a <header> always exists inside <article>;
    # a missing header would raise AttributeError here — confirm.
    processor.decompose_all(article.find('header').find_all('img'))
    processor.decompose_all(article.find_all('blockquote'))
    processor.decompose(article.find(class_="meta-sidebar"))

    categories = processor.collect_categories(article.find_all(class_='cat'))
    datetime_list = processor.collect_datetime(article.find(class_='date'))
    author = processor.collect_text(article.find(class_='author'))
    title = processor.collect_text(article.find(class_='article-title'))
    ingress = processor.collect_text(article.find(class_='ingress'), True)
    text = processor.collect_text(article.find(class_='content'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(article.find_all(class_='featured-image'))

    return processor.create_dictionary('Kd-lehti', url, r.status_code, categories, datetime_list, author, title, ingress, text, images, captions)
示例#11
0
def parse(url):
    """Parse a Karjalainen article at *url* into the shared result dict.

    Returns a placeholder dict with empty fields on a 404 response or when
    the sp-component container is missing.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(id='sp-component')
    # PEP 8: compare to None with `is`, not `==`.
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))

    # Category links and the timestamp share one meta block.
    # NOTE(review): assumes .category_date is always present — confirm.
    meta = article.find(class_='category_date')

    categories = processor.collect_categories(meta.find_all('a'))
    datetime_list = processor.collect_datetime(meta)
    author = processor.collect_text(article.find(class_='author_credits'))
    title = processor.collect_text(article.find('h1'))
    text = processor.collect_text(article.find(class_='itemIntroText'))
    images = processor.collect_images_by_parent(
        article.find_all(class_='itemImage'), 'https://www.karjalainen.fi')
    captions = processor.collect_image_captions(
        article.find_all(class_='itemImageCaption'))

    return processor.create_dictionary('Karjalainen', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, captions)
示例#12
0
def parse(url):
    """Parse an Etelä-Suomen Sanomat article at *url* into the shared dict.

    Returns a placeholder dict with empty fields on a 404 response or when
    the main article wrapper is missing.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='mainArticle-content-wrapper')
    # PEP 8: compare to None with `is`, not `==`.
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))

    # NOTE(review): assumes the article header is always present — confirm.
    header = article.find(id='main-article-header')
    categories = processor.collect_categories(header.find_all(class_='section'))
    datetime_list = processor.collect_datetime(article.find(class_='article-date'))
    author = processor.collect_text(article.find(class_='authorName'))
    title = processor.collect_text(article.find(class_='main-article-header'))
    text = processor.collect_text(article.find(class_='body'))

    # Remove author portrait / subscribe box only now: the text collection
    # above must run before these nodes disappear from the tree order,
    # but images below must not include them.
    processor.decompose(article.find(class_='authorPicture'))
    processor.decompose(article.find(id='main-subscribe'))

    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(article.find_all(class_='main-media-caption'))

    return processor.create_dictionary('Etelä-Suomen Sanomat', url, r.status_code, categories, datetime_list, author, title, u'', text, images, captions)
示例#13
0
def parse(url):
    """Parse a Helsingin uutiset article at *url* into the shared dict.

    Returns a placeholder dict with empty fields on a 404 response or when
    no <article> element is present.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    # PEP 8: compare to None with `is`, not `==`.
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))

    departments = article.find(class_='field-name-field-department-tref')
    categories = processor.collect_categories(departments.find_all('a'))

    datetime_list = processor.collect_datetime(article.find(class_='field-name-post-date'))

    # Author byline is optional; strip its picture before reading the name.
    author_div = article.find(class_='author')
    if author_div is not None:
        processor.decompose(author_div.find(class_='img'))
        author = processor.collect_text(author_div.find('h3'))
    else:
        author = u''

    title = processor.collect_text(article.find('h1'))
    text = processor.collect_text(article.find(class_='field field-name-body'))
    images = processor.collect_images_by_parent(article.find_all(class_='img'), '')
    captions = processor.collect_image_captions(article.find_all(class_='caption'))

    return processor.create_dictionary('Helsingin uutiset', url, r.status_code, categories, datetime_list, author, title, u'', text, images, captions)
示例#14
0
def parse(url):
    """Parse a Keskisuomalainen article at *url* into the shared dict.

    Returns a placeholder dict (with the site name still set, matching the
    original behavior) on a 404 response; an unnamed placeholder when the
    main role container is missing.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('Keskisuomalainen', url,
                                           r.status_code, [u''], [u''], u'',
                                           u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(role='main')
    # PEP 8: compare to None with `is`, not `==`.
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))

    categories = processor.collect_categories(
        article.find_all(class_='article__section'))
    datetime_list = processor.collect_datetime(
        article.find(class_='article__published'))
    author = processor.collect_text(article.find(class_='article__author'))
    title = processor.collect_text(article.find(class_='article__title'))
    ingress = processor.collect_text(article.find(class_='article__summary'))
    text = processor.collect_text(article.find(class_='article__body'))
    images = processor.collect_images_by_parent(
        article.find_all(class_='article__images'), '')
    captions = processor.collect_image_captions(
        article.find_all(itemprop='caption description'))

    return processor.create_dictionary('Keskisuomalainen', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
示例#15
0
def parse(url):
    """Parse a Suomen uutiset article at *url* into the shared dict.

    The timestamp text (e.g. "January 5, 2016 12:30") is normalized via
    processor.convert_month and parsed with strptime.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    # PEP 8: compare to None with `is`, not `==`.
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose_all(article.find_all(class_='somebar'))
    processor.decompose(article.find(class_='tags'))

    categories = processor.collect_categories(article.find_all(class_='post-category'))

    # NOTE(review): assumes a .timestamp element always exists — confirm.
    datetime_string = article.find(class_='timestamp').get_text(' ', strip=True)
    datetime_string = processor.convert_month(datetime_string.replace(',', ''))
    datetime_list = [datetime.strptime(datetime_string, '%m %d %Y %H:%M')]

    author = processor.collect_text(article.find(class_='article-page-writer'), True)
    title = processor.collect_text(article.find(class_='post-title'))
    text = processor.collect_text(article.find(class_='post-content'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(article.find_all('figcaption'))

    return processor.create_dictionary('Suomen uutiset', url, r.status_code, categories, datetime_list, author, title, u'', text, images, captions)
示例#16
0
def parse(url):
    """Parse a Lapin kansa article at *url* into the shared result dict.

    Returns a placeholder dict with empty fields on a 404 response or when
    the main content area is missing.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='main-content-area')
    # PEP 8: compare to None with `is`, not `==`.
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))

    # NOTE(review): assumes a .post-meta block always exists — confirm.
    meta = article.find(class_='post-meta')

    categories = processor.collect_categories(meta.find_all(class_='category'), True)
    datetime_list = processor.collect_datetime(meta, 'datetime date')
    author = processor.collect_text(article.find(class_='author--main'))
    title = processor.collect_text(article.find(class_='heading--main'))
    ingress = processor.collect_text(article.find(class_='heading--secondary'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(article.find_all(class_='caption'))

    # Image wrappers must be collected above, then removed so their alt
    # text does not bleed into the body text below.
    processor.decompose_all(article.find_all(class_='image-wrapper'))

    text = processor.collect_text(article.find(class_='content--main'))

    return processor.create_dictionary('Lapin kansa', url, r.status_code, categories, datetime_list, author, title, ingress, text, images, captions)
示例#17
0
def parse(url):
    """Parse a Kauppalehti article via the site's JSON news API.

    The article id is taken from the last path segment of *url*; a
    placeholder dict is returned on a 404 response.
    """
    api_path = 'https://www.kauppalehti.fi/api/news/article/'
    _id = url.split('/')[-1]

    r = requests.get(api_path + _id)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'

    # Renamed from `json` to avoid shadowing the stdlib `json` module name.
    data = r.json()

    categories = [processor.process(data['mainCategory']['name'])]
    datetime_list = processor.collect_datetime_json(data, 'published',
                                                    'modified')
    # NOTE(review): assumes at least one byline entry is always present.
    author = processor.process(data['byline'][0])
    title = processor.process(data['title'])
    ingress = processor.process(data['headline'])

    # The body arrives as an HTML fragment; strip markup via BeautifulSoup.
    text_html = BeautifulSoup(data['body'], "html.parser")
    text = processor.collect_text(text_html)

    if 'keyImage' in data:
        # Route through the 547px-wide image proxy, as the site front end does.
        image_url = 'http://images.kauppalehti.fi/547x/http:' + data['keyImage']
        images = [image_url]
    else:
        images = [u'']

    return processor.create_dictionary('Kauppalehti', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, [u''])
示例#18
0
def parse(url):
    """Parse a Kouvolan sanomat article at *url* into the shared dict.

    The body text is whatever remains of <article> after metadata, images,
    header and footer have been decomposed, so extraction order matters.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    # PEP 8: compare to None with `is`, not `==`.
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('blockquote'))
    processor.decompose(article.find(class_='sticky-inner-wrapper'))

    # Breadcrumb: drop the first (front page) and last (current article) items.
    categories_list = soup.find(class_='breadcrumb').find_all('li')[1:-1]
    categories = processor.collect_categories(categories_list)

    datetime_list = processor.collect_datetime(article.find(class_='meta'))

    # Join all bylines with commas (replaces the += / trailing-slice trick).
    authors = article.find(class_='authors')
    author = ','.join(
        processor.collect_text(div.find('p'))
        for div in authors.find_all(class_='author')
    )

    processor.decompose(authors)

    title = processor.collect_text(article.find('h1'))
    ingress = processor.collect_text(article.find(class_='lead'))

    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(article.find_all('figcaption'))

    # Strip the remaining chrome, then treat the leftover tree as body text.
    processor.decompose(article.find(class_='sticky-outer-wrapper active'))
    processor.decompose(article.find('header'))
    processor.decompose(article.find('footer'))

    text = processor.collect_text(article)

    return processor.create_dictionary('Kouvolan sanomat', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
示例#19
0
def parse(url):
    """Parse a Suomenmaa article at *url* into the shared result dict.

    The page has no single article container, so boilerplate (footers,
    pull quotes, vignettes) is decomposed from the whole soup before the
    individual fields are collected.
    """
    response = requests.get(url)
    if response.status_code == 404:
        return processor.create_dictionary('', url, response.status_code,
                                           [u''], [u''], u'', u'', u'', u'',
                                           [u''], [u''])

    response.encoding = 'UTF-8'
    soup = BeautifulSoup(response.text, "html.parser")

    # Clear page chrome and pull quotes up front.
    for css_class in ('pohja', 'footer_left', 'keski_footer',
                      'right_footer', 'sitaatti'):
        processor.decompose_all(soup.find_all(class_=css_class))

    # The vignette doubles as the category label; read it, then remove it.
    categories = [processor.collect_text(soup.find(class_='vinjetti'))]
    processor.decompose_all(soup.find_all(class_='vinjetti'))

    datetime_list = processor.collect_datetime(
        soup.find(class_='datetime').parent.parent)
    datetime_list.reverse()

    author = processor.collect_text(soup.find(class_='text-editor'))
    title = processor.collect_text(soup.find(class_='otsikko'))
    ingress = processor.collect_text(soup.find(class_='alarivi'))

    processor.decompose_all(soup.find_all(class_='alarivi'))

    # Accumulate body paragraphs, skipping any paragraph whose text is
    # already contained in what has been gathered so far (de-duplication).
    text = ''
    for block in soup.find_all(class_='teksti'):
        block_text = processor.collect_text(block)
        if block_text not in text:
            text = text + ' ' + block_text
    text = text.strip()

    images, captions = [u''], [u'']
    thumbnail_div = soup.find(class_='pikkukuva')
    if thumbnail_div:
        thumbnail_imgs = thumbnail_div.find_all('img')
        images = processor.collect_images(thumbnail_imgs, 'data-aghref',
                                          'http://www.suomenmaa.fi/')
        captions = [thumbnail_imgs[0]['alt']]

    return processor.create_dictionary('Suomenmaa', url, response.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
示例#20
0
def parse(url):
    """Parse an Ilkka article at *url* into the shared result dict.

    Returns a placeholder dict with empty fields on a 404 response or when
    the full-article container is missing.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='article__full')
    # PEP 8: compare to None with `is`, not `==`.
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))

    categories = processor.collect_categories(
        article.find_all(class_='article__meta__category'))

    title = processor.collect_text(article.find(class_='medium-title'))

    datetime_list = processor.collect_datetime(
        article.find(class_='article__meta__timestamp'))

    author = processor.collect_text(article.find(class_='author__name'))
    ingress = processor.collect_text(article.find(class_='lead'))

    # Join paragraph texts with single spaces (linear-time, vs. repeated +=).
    text = ' '.join(
        processor.collect_text(string) for string in article.find_all('p')
    ).strip()

    images = processor.collect_images(article.find_all('img'), 'src',
                                      'http://www.ilkka.fi')

    # Captions are stored in the data-caption attribute of anchor tags.
    captions = [
        caption_element['data-caption']
        for caption_element in article.find_all(
            lambda tag: tag.name == 'a' and 'data-caption' in tag.attrs)
    ]

    return processor.create_dictionary('Ilkka', url, r.status_code, categories,
                                       datetime_list, author, title, ingress,
                                       text, images, captions)
示例#21
0
def parse(url):
    """Parse a Vihreä lanka article at *url* into the shared result dict.

    The body text is what remains of the node after metadata, share widgets
    and caption elements have been decomposed, so extraction order matters.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='node-wrap')
    # PEP 8: compare to None with `is`, not `==`.
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='kredIso'))
    processor.decompose_all(article.find_all(class_='tyrkkyBox'))
    processor.decompose(article.find(class_='avainsanat'))
    processor.decompose(article.find(class_='twitter-share-button'))
    processor.decompose(article.find(class_='fb-like'))
    processor.decompose(article.find(class_='moreLanka'))
    # Second pass: removes the next kredIso element, if the page has two.
    processor.decompose(article.find(class_='kredIso'))
    processor.decompose(article.find('cite'))

    meta = article.find(class_='juttutiedot')
    # Dropped a stray trailing comma from the original call.
    datetime_list = processor.collect_datetime(meta)
    author = processor.collect_text(meta.find(class_='author'))
    processor.decompose(meta)

    title = processor.collect_text(article.find('h2'), True)
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='kuvaTekstiIso'))

    # Remove caption elements so they don't bleed into the body text.
    processor.decompose_all(article.find_all(class_='kuvaTekstiIso'))
    processor.decompose_all(article.find_all('figcaption'))

    text = processor.collect_text(article)

    return processor.create_dictionary('Vihreä lanka', url, r.status_code,
                                       [u''], datetime_list, author, title,
                                       u'', text, images, captions)
示例#22
0
def parse(url):
    """Scrape a Hyvä terveys article page into the common result dictionary.

    Returns an empty-field dictionary when the page is a 404 or no
    <article> element is present.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    # Remove scripts and related-content blocks so they do not pollute
    # the extracted text.
    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='region bottom'))
    processor.decompose(
        article.find(class_='field-name-field-related-content'))

    categories = processor.collect_categories(
        article.find_all(class_='field-name-field-category'))
    datetime_list = processor.collect_datetime(
        article.find(class_='field-name-post-date'), 'timedate')
    author = processor.collect_text(
        article.find(class_='field-name-field-author'))
    title = processor.collect_text(article.find('h1'))
    ingress = processor.collect_text(
        article.find(class_='field-name-field-summary'))
    text = processor.collect_text(article.find(class_='field-name-field-body'))

    # Skip lazy-load placeholder images; only real image URLs are kept.
    images = [
        img
        for img in processor.collect_images(article.find_all('img'), 'src', '')
        if 'placeholder' not in img
    ]

    captions = processor.collect_image_captions(
        article.find_all(class_='file-image-description-caption'))

    return processor.create_dictionary('Hyvä terveys', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
示例#23
0
def parse_from_archive(url, content):
    """Parse an archived Kauppalehti article from raw HTML *content*.

    Unlike the live parsers, this takes the page markup directly and always
    reports status 200 on success (404 only when parsing yields nothing).
    """
    article = BeautifulSoup(content, "html.parser")

    if article is None:
        return processor.create_dictionary('Kauppalehti', url, 404, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    meta = article.find(class_='hakutuloslahde')

    # "online" in the source line distinguishes Kauppalehti Online articles.
    domain = 'Kauppalehti'
    if 'online' in meta.text:
        domain += ' Online'

    datetime_list = processor.collect_datetime(meta)

    # The category follows a comma in the source line, when present.
    if ',' in meta.text:
        categories = [processor.collect_text(meta).split(',')[1].strip()]
    else:
        categories = [u'']

    author = processor.collect_text(article.find(class_='signeeraus'))

    title = processor.collect_text(article.find(class_='otsikko'))

    ingress = processor.collect_text(article.find_all(class_='jalkirivi')[1])
    ingress += ' ' + processor.collect_text(article.find(class_='esirivi'))
    ingress = ingress.strip()

    # Concatenate all body divs first; post-process and append the
    # correction note ONCE afterwards.  (Previously these two steps ran
    # inside the loop, re-processing the text and duplicating the
    # 'korjaus' text for every body div — cf. the Satakunnan kansa parser.)
    text = ''
    for text_content in article.find_all(class_='artikkelip'):
        text += processor.collect_text(text_content) + ' '
    text = processor.process(text.strip())
    text += processor.collect_text(article.find(class_='korjaus'))

    captions = processor.collect_image_captions(
        article.find_all(class_='kuva'))

    return processor.create_dictionary(domain, url, 200, categories,
                                       datetime_list, author, title, ingress,
                                       text, [u''], captions)
示例#24
0
def parse(url):
    """Scrape a Demokraatti article page into the common result dictionary."""
    response = requests.get(url)
    if response.status_code == 404:
        return processor.create_dictionary('', url, response.status_code,
                                           [u''], [u''], u'', u'', u'', u'',
                                           [u''], [u''])

    response.encoding = 'UTF-8'
    document = BeautifulSoup(response.text, "html.parser")

    article = document.find('article')
    if article is None:
        return processor.create_dictionary('', url, response.status_code,
                                           [u''], [u''], u'', u'', u'', u'',
                                           [u''], [u''])

    # Drop scripts, share buttons, the trailing paragraph, the footer and
    # the author avatar before collecting content.
    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='keywords-block'))
    processor.decompose_all(article.find_all(class_='share-buttons-block'))
    processor.decompose(article.find_all('p')[-1])
    processor.decompose(article.footer)
    processor.decompose(article.find(class_='wp-user-avatar'))

    categories = processor.collect_categories(
        article.find_all(class_='category'))

    # The date element also contains a category span; strip it so only the
    # timestamp text remains, then remove the whole element afterwards.
    date_element = article.find(class_='single-post-date')
    processor.decompose(date_element.find(class_='category'))
    datetime_list = processor.collect_datetime(date_element)

    processor.decompose(article.find(class_='single-post-date'))

    author = processor.collect_text(
        article.find(class_='post-author').find('li'))
    title = processor.collect_text(article.find(class_='entry-title'))
    text = processor.collect_text(article.find(class_='post-content'))
    images = processor.collect_images(article.find_all('img'), 'src',
                                      'https://demokraatti.fi')

    return processor.create_dictionary('Demokraatti', url,
                                       response.status_code, categories,
                                       datetime_list, author, title, u'',
                                       text, images, [u''])
示例#25
0
def parse(url):
    """Scrape a Verkkouutiset article page into the common result dictionary."""
    response = requests.get(url)
    if response.status_code == 404:
        return processor.create_dictionary('', url, response.status_code,
                                           [u''], [u''], u'', u'', u'', u'',
                                           [u''], [u''])

    response.encoding = 'UTF-8'
    document = BeautifulSoup(response.text, "html.parser")

    article = document.find('article')
    if article is None:
        return processor.create_dictionary('', url, response.status_code,
                                           [u''], [u''], u'', u'', u'', u'',
                                           [u''], [u''])

    processor.decompose_all(article.find_all('script'))

    datetime_list = processor.collect_datetime_objects(
        article.find_all('time'), 'datetime')

    # The byline ends with a " |" separator that is not part of the name.
    author = processor.collect_text(article.find(class_='posted-on'))
    author = author.replace(' |', '')

    processor.decompose(article.find(class_='entry-meta'))

    title = processor.collect_text(article.find(class_='entry-title'))

    # Collect the ingress, then remove it so the body text does not
    # repeat it.
    ingress = processor.collect_text(
        article.find(class_='entry-content__ingress'))
    processor.decompose(article.find(class_='entry-content__ingress'))

    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='entry-header__caption'))
    text = processor.collect_text(article.find(class_='entry-content'))

    return processor.create_dictionary('Verkkouutiset', url,
                                       response.status_code, [u''],
                                       datetime_list, author, title, ingress,
                                       text, images, captions)
示例#26
0
def parse(url):
    """Scrape an Aamuset article page into the common result dictionary."""
    response = requests.get(url)
    if response.status_code == 404:
        return processor.create_dictionary('', url, response.status_code,
                                           [u''], [u''], u'', u'', u'', u'',
                                           [u''], [u''])

    response.encoding = 'UTF-8'
    document = BeautifulSoup(response.text, "html.parser")

    article = document.find('article')
    if article is None:
        return processor.create_dictionary('', url, response.status_code,
                                           [u''], [u''], u'', u'', u'', u'',
                                           [u''], [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose_all(
        article.find_all(class_='views-field-field-aamuset-related-images'))

    # Tags live outside the <article> element, in a separate tag list.
    tag_container = document.find(class_='tsv3-c-as-articletags')
    categories = processor.collect_categories(tag_container.find_all('li'))

    datetime_list = processor.collect_datetime(article.find('time'))

    # Collect the byline, then remove it so it is excluded from the body.
    author = processor.collect_text(article.find(class_='kirjoittaja'))
    processor.decompose(article.find(class_='kirjoittaja'))

    title = processor.collect_text(article.find(class_='otsikko'))
    text = processor.collect_text(
        article.find(class_='tsv3-c-as-article__textitem--teksti'))
    images = processor.collect_images(article.find_all('img'), 'src',
                                      'http://www.aamuset.fi')
    captions = processor.collect_image_captions(
        article.find_all(class_='tsv3-c-as-article__attachment__caption'))

    return processor.create_dictionary('Aamuset', url, response.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, captions)
示例#27
0
def parse_from_archive(url, content):
    """Parse an archived Satakunnan kansa article from raw HTML *content*.

    Takes the page markup directly and always reports status 200 on
    success (404 only when parsing yields nothing).
    """
    article = BeautifulSoup(content, "html.parser")

    if article is None:
        return processor.create_dictionary('Satakunnan kansa', url, 404, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    meta = article.find(class_='hakutuloslahde')

    datetime_list = processor.collect_datetime(meta)

    # Main category follows the comma in the source line; the subcategory
    # comes from the first 'jalkirivi' element.  Keep only non-empty ones.
    category = processor.collect_text(meta).split(',')[1].strip()
    subcat = processor.collect_text(article.find(class_='jalkirivi'))
    categories = [c for c in (category, subcat) if c]

    author = processor.collect_text(article.find(class_='signeeraus'))

    title = processor.collect_text(article.find(class_='otsikko'))

    ingress = processor.collect_text(article.find_all(class_='jalkirivi')[1])
    ingress += ' ' + processor.collect_text(article.find(class_='esirivi'))
    ingress = ingress.strip()

    # Concatenate all body divs, then post-process once and append the
    # correction note.
    text = ''
    for text_content in article.find_all(class_='artikkelip'):
        text += processor.collect_text(text_content) + ' '
    text = processor.process(text.strip())
    text += processor.collect_text(article.find(class_='korjaus'))

    captions = processor.collect_image_captions(
        article.find_all(class_='kuva'))

    return processor.create_dictionary('Satakunnan kansa', url, 200,
                                       categories, datetime_list, author,
                                       title, ingress, text, [u''], captions)
示例#28
0
def parse(url):
    """Scrape an Aamulehti article page into the common result dictionary."""
    response = requests.get(url)
    if response.status_code == 404:
        return processor.create_dictionary('', url, response.status_code,
                                           [u''], [u''], u'', u'', u'', u'',
                                           [u''], [u''])

    response.encoding = 'UTF-8'
    document = BeautifulSoup(response.text, "html.parser")

    article = document.find(class_='article-content')
    if article is None:
        return processor.create_dictionary('', url, response.status_code,
                                           [u''], [u''], u'', u'', u'', u'',
                                           [u''], [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='related-articles-container'))

    categories = processor.collect_categories(
        article.find_all(class_='category'))

    # The meta element mixes category and update info with the timestamp;
    # strip both before reading the date.
    meta_element = article.find(class_='post-meta')
    processor.decompose(meta_element.find(class_='category'))
    processor.decompose(meta_element.find(class_='updated'))
    datetime_list = processor.collect_datetime(meta_element)

    author = processor.collect_text(article.find(class_='author--main'))
    title = processor.collect_text(article.find(class_='heading--main'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='caption'))

    # Remove image wrappers so captions do not leak into the body text.
    processor.decompose_all(article.find_all(class_='image-wrapper'))
    text = processor.collect_text(article.find(class_='content--main'))

    return processor.create_dictionary('Aamulehti', url, response.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, captions)
示例#29
0
def parse(url):
    """Scrape an MTV article page into the common result dictionary."""
    response = requests.get(url)
    if response.status_code == 404:
        return processor.create_dictionary('', url, response.status_code,
                                           [u''], [u''], u'', u'', u'', u'',
                                           [u''], [u''])

    response.encoding = 'UTF-8'
    document = BeautifulSoup(response.text, "html.parser")

    article = document.find(class_='content')
    if article is None:
        return processor.create_dictionary('', url, response.status_code,
                                           [u''], [u''], u'', u'', u'', u'',
                                           [u''], [u''])

    # Drop scripts, ads, inline styles and the bottom widget area.
    processor.decompose_all(article.find_all('script'))
    processor.decompose_all(article.find_all(class_='ad'))
    processor.decompose_all(article.find_all(class_='ad-container'))
    processor.decompose_all(article.find_all('style'))
    processor.decompose(article.find(id='fullWidthBottom'))

    categories = processor.collect_categories(
        article.find_all(class_='article-category'))
    datetime_list = processor.collect_datetime_objects(
        article.find_all('time'), 'datetime')
    author = processor.collect_text(article.find(class_='author-name'))
    title = processor.collect_text(article.find(class_='article-title'))
    ingress = processor.collect_text(article.find(class_='lead-paragraph'))
    text = processor.collect_text(article.find(class_='editorial'))
    images = processor.collect_images_by_parent(
        article.find_all(class_='img-container'), 'http:')
    captions = processor.collect_image_captions(
        article.find_all(class_='figcaption'))

    return processor.create_dictionary('Mtv', url, response.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
示例#30
0
def parse(url):
    """Scrape a Maaseudun tulevaisuus article page into the result dictionary."""
    response = requests.get(url)
    if response.status_code == 404:
        return processor.create_dictionary('', url, response.status_code,
                                           [u''], [u''], u'', u'', u'', u'',
                                           [u''], [u''])

    response.encoding = 'UTF-8'
    document = BeautifulSoup(response.text, "html.parser")

    article = document.find('article')
    if article is None:
        return processor.create_dictionary('', url, response.status_code,
                                           [u''], [u''], u'', u'', u'', u'',
                                           [u''], [u''])

    processor.decompose_all(article.find_all('script'))

    categories = processor.collect_categories(
        article.find_all(class_='article-release-info__section'))
    datetime_list = processor.collect_datetime(
        article.find(class_='article-release-info__time'))
    author = processor.collect_text(article.find(itemprop='author'))

    # The heading container holds both the title (h1) and the ingress (p).
    heading = article.find(class_='article-single-heading')
    title = processor.collect_text(heading.find('h1'))
    ingress = processor.collect_text(heading.find('p'))

    text = processor.collect_text(
        article.find(class_='article-single-section__content'))
    images = processor.collect_images(article.find_all('img'), 'src',
                                      'http://www.maaseuduntulevaisuus.fi')
    captions = processor.collect_image_captions(article.find_all('figcaption'))

    return processor.create_dictionary('Maaseudun tulevaisuus', url,
                                       response.status_code, categories,
                                       datetime_list, author, title, ingress,
                                       text, images, captions)