Example #1
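All of these parsers rely on the same module-level imports, which the snippets themselves omit. A minimal preamble that would make them runnable, assuming `processor` is the project's own helper module (its internals are not shown here):

import requests
from bs4 import BeautifulSoup
from datetime import datetime  # needed by Example #6

import processor  # project-local helpers: collect_*, decompose*, create_dictionary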
def parse( url ):

	r = requests.get( url )
	if r.status_code == 404:
		return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

	r.encoding = 'UTF-8'
	soup = BeautifulSoup( r.text, "html.parser" )

	article = soup.find( class_ = 'region-content-inner' )
	if article is None:
		return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

	processor.decompose_all( article.find_all( 'script' ) )
	processor.decompose_all( article.find_all( 'noscript' ) )
	processor.decompose( article.find( id = 'comments' ) )
	processor.decompose( article.find( class_ = 'contributor' ) )
	processor.decompose( article.find( class_ = 'field-name-field-author-image' ) )

	categories = processor.collect_categories( article.find_all( class_ = 'field-name-field-category' ) )
	datetime_list = processor.collect_datetime_objects( article.find_all( class_ = 'date-display-single' ), 'content' )
	author = processor.collect_text( article.find( class_ = 'author-name' ) )
	title = processor.collect_text( article.find( id = 'page-title' ) )
	text = processor.collect_text( article.find( class_ = 'field-name-body' ) )
	images = processor.collect_images( article.find_all( 'img' ), 'src', '' )
	captions = processor.collect_image_captions( article.find_all( class_ = 'field-name-field-image-description' ) )

	return processor.create_dictionary('Uusi Suomi', url, r.status_code, categories, datetime_list, author, title, u'', text, images, captions)
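The snippets touch `processor` only through a few helpers. As a reading aid, here is a hedged sketch of what `create_dictionary`, `decompose`, and `decompose_all` might look like; the argument order and the None-tolerance of `decompose` are inferred from the call sites in these examples, and the dictionary keys are purely illustrative, not the project's actual field names.

def create_dictionary(domain, url, http_status, categories, datetime_list,
                      author, title, ingress, text, images, captions):
    # Keys are illustrative; only the argument order comes from the call sites.
    return {
        'domain': domain, 'url': url, 'http_status': http_status,
        'categories': categories, 'datetime_list': datetime_list,
        'author': author, 'title': title, 'ingress': ingress,
        'text': text, 'images': images, 'captions': captions,
    }

def decompose(element):
    # Call sites pass find() results directly, so this must tolerate None.
    if element is not None:
        element.decompose()

def decompose_all(elements):
    for element in elements:
        decompose(element)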
Example #2
def parse( url ):

	r = requests.get( url )
	if r.status_code == 404:
		return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

	r.encoding = 'UTF-8'
	soup = BeautifulSoup( r.text, "html.parser" )

	article = soup.find( class_ = 'main-content-area' )
	if article is None:
		return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

	processor.decompose_all( article.find_all( 'script' ) )

	meta = article.find( class_ = 'post-meta' )

	categories = processor.collect_categories( meta.find_all( class_ = 'category' ), True )
	datetime_list = processor.collect_datetime( meta, 'datetime date' )
	author = processor.collect_text( article.find( class_ = 'author--main' ) )
	title = processor.collect_text( article.find( class_ = 'heading--main' ) )
	ingress = processor.collect_text( article.find( class_ = 'heading--secondary' ) )
	images = processor.collect_images( article.find_all( 'img' ), 'src', '' )
	captions = processor.collect_image_captions( article.find_all( class_ = 'caption' ) )

	processor.decompose_all( article.find_all( class_ = 'image-wrapper' ) )

	text = processor.collect_text( article.find( class_ = 'content--main' ) )

	return processor.create_dictionary('Lapin kansa', url, r.status_code, categories, datetime_list, author, title, ingress, text, images, captions)
Example #3
def parse( url ):

	r = requests.get( url )
	if r.status_code == 404:
		return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

	r.encoding = 'UTF-8'
	soup = BeautifulSoup( r.text, "html.parser" )

	article = soup.find( 'article' )
	if article is None:
		return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

	processor.decompose_all( article.find_all( 'script' ) )
	processor.decompose_all( article.find( 'header' ).find_all( 'img' ) )
	processor.decompose_all( article.find_all( 'blockquote' ) )
	processor.decompose( article.find( class_ = "meta-sidebar" ) )

	categories = processor.collect_categories( article.find_all( class_ = 'cat' ) )
	datetime_list = processor.collect_datetime( article.find( class_ = 'date' ) )
	author = processor.collect_text( article.find( class_ = 'author' ) )
	title = processor.collect_text( article.find( class_ = 'article-title' ) )
	ingress = processor.collect_text( article.find( class_ = 'ingress' ), True )
	text = processor.collect_text( article.find( class_ = 'content' ) )
	images = processor.collect_images( article.find_all( 'img' ), 'src', '' )
	captions = processor.collect_image_captions( article.find_all( class_ = 'featured-image' ) )

	return processor.create_dictionary('Kd-lehti', url, r.status_code, categories, datetime_list, author, title, ingress, text, images, captions)
Example #4
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='single-article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='related-links'))
    processor.decompose_all(article.find_all(class_='article-ad-block'))

    categories = processor.collect_categories(
        soup.find(class_='section-title'))
    datetime_list = processor.collect_datetime(article.find('time'))
    author = processor.collect_text(article.find(class_='byline'))
    title = processor.collect_text(article.find(class_='article-title'))
    ingress = processor.collect_text(article.find(class_='ingress'))
    text = processor.collect_text(article.find(class_='body'))
    images = processor.collect_images(article.find_all('img'), 'src', 'http:')
    captions = processor.collect_image_captions(article.find_all('figcaption'))

    return processor.create_dictionary('Taloussanomat', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
Example #5
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('Keskisuomalainen', url,
                                           r.status_code, [u''], [u''], u'',
                                           u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(role='main')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))

    categories = processor.collect_categories(
        article.find_all(class_='article__section'))
    datetime_list = processor.collect_datetime(
        article.find(class_='article__published'))
    author = processor.collect_text(article.find(class_='article__author'))
    title = processor.collect_text(article.find(class_='article__title'))
    ingress = processor.collect_text(article.find(class_='article__summary'))
    text = processor.collect_text(article.find(class_='article__body'))
    images = processor.collect_images_by_parent(
        article.find_all(class_='article__images'), '')
    captions = processor.collect_image_captions(
        article.find_all(itemprop='caption description'))

    return processor.create_dictionary('Keskisuomalainen', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
Example #6
def parse( url ):

	r = requests.get( url )
	if r.status_code == 404:
		return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

	r.encoding = 'UTF-8'
	soup = BeautifulSoup( r.text, "html.parser" )

	article = soup.find( 'article' )
	if article is None:
		return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

	processor.decompose_all( article.find_all( 'script' ) )
	processor.decompose_all( article.find_all( class_ = 'somebar' ) )
	processor.decompose( article.find( class_ = 'tags' ) )

	categories = processor.collect_categories( article.find_all( class_ = 'post-category' ) )

	datetime_string = article.find( class_ = 'timestamp' ).get_text( ' ', strip = True )
	datetime_string = processor.convert_month( datetime_string.replace( ',', '' ) )
	datetime_list = [datetime.strptime( datetime_string, '%m %d %Y %H:%M' )]

	author = processor.collect_text( article.find( class_ = 'article-page-writer' ), True )
	title = processor.collect_text( article.find( class_ = 'post-title' ) )
	text = processor.collect_text( article.find( class_ = 'post-content' ) )
	images = processor.collect_images( article.find_all( 'img' ), 'src', '' )
	captions = processor.collect_image_captions( article.find_all( 'figcaption' ) )

	return processor.create_dictionary('Suomen uutiset', url, r.status_code, categories, datetime_list, author, title, u'', text, images, captions)
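Example #6 is the only parser that builds its datetime objects by hand, so it leans on `processor.convert_month` to turn a written-out Finnish month name into a number that the '%m %d %Y %H:%M' format can parse. A minimal sketch under that assumption (the month table and the lowercasing are hypothetical details, not the project's actual code):

FINNISH_MONTHS = {
    u'tammikuuta': u'1', u'helmikuuta': u'2', u'maaliskuuta': u'3',
    u'huhtikuuta': u'4', u'toukokuuta': u'5', u'kesäkuuta': u'6',
    u'heinäkuuta': u'7', u'elokuuta': u'8', u'syyskuuta': u'9',
    u'lokakuuta': u'10', u'marraskuuta': u'11', u'joulukuuta': u'12',
}

def convert_month(datetime_string):
    # Swap a Finnish month name for its number so that
    # datetime.strptime(..., '%m %d %Y %H:%M') can parse the string.
    datetime_string = datetime_string.lower()
    for name, number in FINNISH_MONTHS.items():
        datetime_string = datetime_string.replace(name, number)
    return datetime_string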
Example #7
def parse( url ):

	r = requests.get( url )
	if r.status_code == 404:
		return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

	r.encoding = 'UTF-8'
	soup = BeautifulSoup( r.text, "html.parser" )

	article = soup.find( class_ = 'mainArticle-content-wrapper' )
	if article is None:
		return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

	processor.decompose_all( article.find_all( 'script' ) )

	header = article.find( id = 'main-article-header' )
	categories = processor.collect_categories( header.find_all( class_ = 'section' ) )
	datetime_list = processor.collect_datetime( article.find( class_ = 'article-date' ) )
	author = processor.collect_text( article.find( class_ = 'authorName' ) )
	title = processor.collect_text( article.find( class_ = 'main-article-header' ) )
	text = processor.collect_text( article.find( class_ = 'body' ) )

	processor.decompose( article.find( class_ = 'authorPicture' ) )
	processor.decompose( article.find( id = 'main-subscribe' ) )

	images = processor.collect_images( article.find_all( 'img' ), 'src', '' )
	captions = processor.collect_image_captions( article.find_all( class_ = 'main-media-caption' ) )

	return processor.create_dictionary('Etelä-Suomen Sanomat', url, r.status_code, categories, datetime_list, author, title, u'', text, images, captions)
Example #8
def parse_from_archive(url, content):

    article = BeautifulSoup(content, "html.parser")

    if article is None:
        return processor.create_dictionary('Kainuun sanomat', url, 404, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    meta = article.find(class_='hakutuloslahde')

    datetime_list = processor.collect_datetime(meta)

    if ',' in meta.text:
        categories = [processor.collect_text(meta).split(',')[1].strip()]
    else:
        categories = [u'']

    author = processor.collect_text(article.find(class_='signeeraus'))

    title = processor.collect_text(article.find(class_='otsikko'))

    text_divs = article.find_all(class_='artikkelip')
    text = ''
    for text_content in text_divs:
        text += processor.collect_text(text_content) + ' '
    text = text.strip()

    captions = processor.collect_image_captions(
        article.find_all(class_='kuva'))

    return processor.create_dictionary('Kainuun sanomat', url, 200, categories,
                                       datetime_list, author, title, u'', text,
                                       [u''], captions)
Example #9
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(id='sp-component')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))

    meta = article.find(class_='category_date')

    categories = processor.collect_categories(meta.find_all('a'))
    datetime_list = processor.collect_datetime(meta)
    author = processor.collect_text(article.find(class_='author_credits'))
    title = processor.collect_text(article.find('h1'))
    text = processor.collect_text(article.find(class_='itemIntroText'))
    images = processor.collect_images_by_parent(
        article.find_all(class_='itemImage'), 'https://www.karjalainen.fi')
    captions = processor.collect_image_captions(
        article.find_all(class_='itemImageCaption'))

    return processor.create_dictionary('Karjalainen', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, captions)
Example #10
def parse( url ):

	r = requests.get( url )
	if r.status_code == 404:
		return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

	r.encoding = 'iso-8859-1'
	soup = BeautifulSoup( r.text, "html.parser" )

	article = soup.find( id = 'container_keski' )
	if article is None:
		return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

	processor.decompose_all( article.find_all( 'script' ) )
	processor.decompose( article.find( class_ = 'kp-share-area' ) )

	categories = processor.collect_categories( soup.find_all( class_ = 'sel' ) )
	datetime_list = processor.collect_datetime( article.find( class_ = 'juttuaika' ) )

	author_div = article.find( class_ = 'author' )
	processor.decompose( author_div.find( 'a' ) )
	author = processor.collect_text( author_div, True )

	title = processor.collect_text( article.find( 'h1' ) )
	ingress = processor.collect_text( article.find( class_ = 'ingressi' ), True )
	images = processor.collect_images( article.find_all( 'img' ), 'src', '' )
	captions = processor.collect_image_captions( article.find_all( class_ = 'kuvateksti' ) )

	processor.decompose_all( article.find_all( class_ = 'kuvamiddle' ) )

	text = processor.collect_text( article.find( 'isense' ) )

	return processor.create_dictionary('Iltalehti', url, r.status_code, categories, datetime_list, author, title, ingress, text, images, captions)
Example #11
def parse( url ):

	r = requests.get( url )
	if r.status_code == 404:
		return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

	r.encoding = 'UTF-8'
	soup = BeautifulSoup( r.text, "html.parser" )

	article = soup.find( 'article' )
	if article is None:
		return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

	processor.decompose_all( article.find_all( 'script' ) )

	departments = article.find( class_ = 'field-name-field-department-tref' )
	categories = processor.collect_categories( departments.find_all( 'a' ) )

	datetime_list = processor.collect_datetime( article.find( class_ = 'field-name-post-date' ) )

	author = article.find( class_ = 'author' )
	if author is not None:
		processor.decompose( author.find( class_ = 'img' ) )
		author = processor.collect_text( author.find( 'h3' ) )
	else:
		author = u''

	title = processor.collect_text( article.find( 'h1' ) )
	text = processor.collect_text( article.find( class_ = 'field field-name-body' ) )
	images = processor.collect_images_by_parent( article.find_all( class_ = 'img' ), '')
	captions = processor.collect_image_captions( article.find_all( class_ = 'caption' ) )

	return processor.create_dictionary('Helsingin uutiset', url, r.status_code, categories, datetime_list, author, title, u'', text, images, captions)
Example #12
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('blockquote'))
    processor.decompose(article.find(class_='sticky-inner-wrapper'))

    categories_list = soup.find(class_='breadcrumb').find_all('li')[1:-1]
    categories = processor.collect_categories(categories_list)

    datetime_list = processor.collect_datetime(article.find(class_='meta'))

    authors = article.find(class_='authors')
    author = ''
    for div in authors.find_all(class_='author'):
        author += processor.collect_text(div.find('p')) + ','
    author = author[:-1]

    processor.decompose(authors)

    title = processor.collect_text(article.find('h1'))
    ingress = processor.collect_text(article.find(class_='lead'))

    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(article.find_all('figcaption'))

    processor.decompose(article.find(class_='sticky-outer-wrapper active'))
    processor.decompose(article.find('header'))
    processor.decompose(article.find('footer'))

    text = processor.collect_text(article)

    return processor.create_dictionary('Kouvolan sanomat', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
Example #13
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='node-wrap')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='kredIso'))
    processor.decompose_all(article.find_all(class_='tyrkkyBox'))
    processor.decompose(article.find(class_='avainsanat'))
    processor.decompose(article.find(class_='twitter-share-button'))
    processor.decompose(article.find(class_='fb-like'))
    processor.decompose(article.find(class_='moreLanka'))
    processor.decompose(article.find('cite'))

    meta = article.find(class_='juttutiedot')
    datetime_list = processor.collect_datetime(meta)
    author = processor.collect_text(meta.find(class_='author'))
    processor.decompose(meta)

    title = processor.collect_text(article.find('h2'), True)
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='kuvaTekstiIso'))

    processor.decompose_all(article.find_all(class_='kuvaTekstiIso'))
    processor.decompose_all(article.find_all('figcaption'))

    text = processor.collect_text(article)

    return processor.create_dictionary('Vihreä lanka', url, r.status_code,
                                       [u''], datetime_list, author, title,
                                       u'', text, images, captions)
Example #14
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='region bottom'))
    processor.decompose(
        article.find(class_='field-name-field-related-content'))

    categories = processor.collect_categories(
        article.find_all(class_='field-name-field-category'))
    datetime_list = processor.collect_datetime(
        article.find(class_='field-name-post-date'), 'timedate')
    author = processor.collect_text(
        article.find(class_='field-name-field-author'))
    title = processor.collect_text(article.find('h1'))
    ingress = processor.collect_text(
        article.find(class_='field-name-field-summary'))
    text = processor.collect_text(article.find(class_='field-name-field-body'))

    images = []
    for img in processor.collect_images(article.find_all('img'), 'src', ''):
        if 'placeholder' not in img:
            images.append(img)

    captions = processor.collect_image_captions(
        article.find_all(class_='file-image-description-caption'))

    return processor.create_dictionary('Hyvä terveys', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
Example #15
def parse_from_archive(url, content):

    article = BeautifulSoup(content, "html.parser")

    if article is None:
        return processor.create_dictionary('Kauppalehti', url, 404, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    meta = article.find(class_='hakutuloslahde')

    domain = 'Kauppalehti'
    if 'online' in meta.text:
        domain += ' Online'

    datetime_list = processor.collect_datetime(meta)

    if ',' in meta.text:
        categories = [processor.collect_text(meta).split(',')[1].strip()]
    else:
        categories = [u'']

    author = processor.collect_text(article.find(class_='signeeraus'))

    title = processor.collect_text(article.find(class_='otsikko'))

    ingress = processor.collect_text(article.find_all(class_='jalkirivi')[1])
    ingress += ' ' + processor.collect_text(article.find(class_='esirivi'))
    ingress = ingress.strip()

    text_divs = article.find_all(class_='artikkelip')
    text = ''
    for text_content in text_divs:
        text += processor.collect_text(text_content) + ' '
    text = processor.process(text.strip())
    text += processor.collect_text(article.find(class_='korjaus'))

    captions = processor.collect_image_captions(
        article.find_all(class_='kuva'))

    return processor.create_dictionary(domain, url, 200, categories,
                                       datetime_list, author, title, ingress,
                                       text, [u''], captions)
Example #16
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))

    datetime_list = processor.collect_datetime_objects(
        article.find_all('time'), 'datetime')

    author = processor.collect_text(article.find(class_='posted-on'))
    author = author.replace(' |', '')

    processor.decompose(article.find(class_='entry-meta'))

    title = processor.collect_text(article.find(class_='entry-title'))

    ingress = processor.collect_text(
        article.find(class_='entry-content__ingress'))
    processor.decompose(article.find(class_='entry-content__ingress'))

    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='entry-header__caption'))
    text = processor.collect_text(article.find(class_='entry-content'))

    return processor.create_dictionary('Verkkouutiset', url, r.status_code,
                                       [u''], datetime_list, author, title,
                                       ingress, text, images, captions)
Example #17
def parse_from_archive(url, content):
    article = BeautifulSoup(content, "html.parser")

    if article is None:
        return processor.create_dictionary('Satakunnan kansa', url, 404, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    meta = article.find(class_='hakutuloslahde')

    datetime_list = processor.collect_datetime(meta)

    category = u''
    if ',' in meta.text:
        category = processor.collect_text(meta).split(',')[1].strip()
    subcat = processor.collect_text(article.find(class_='jalkirivi'))

    categories = []
    for c in [category, subcat]:
        if c:
            categories.append(c)

    author = processor.collect_text(article.find(class_='signeeraus'))

    title = processor.collect_text(article.find(class_='otsikko'))

    ingress = processor.collect_text(article.find_all(class_='jalkirivi')[1])
    ingress += ' ' + processor.collect_text(article.find(class_='esirivi'))
    ingress = ingress.strip()

    text_divs = article.find_all(class_='artikkelip')
    text = ''
    for text_content in text_divs:
        text += processor.collect_text(text_content) + ' '
    text = processor.process(text.strip())
    text += processor.collect_text(article.find(class_='korjaus'))

    captions = processor.collect_image_captions(
        article.find_all(class_='kuva'))

    return processor.create_dictionary('Satakunnan kansa', url, 200,
                                       categories, datetime_list, author,
                                       title, ingress, text, [u''], captions)
Example #18
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose_all(
        article.find_all(class_='views-field-field-aamuset-related-images'))

    categories_element = soup.find(class_='tsv3-c-as-articletags')
    categories = processor.collect_categories(
        categories_element.find_all('li'))

    datetime_list = processor.collect_datetime(article.find('time'))

    author = processor.collect_text(article.find(class_='kirjoittaja'))
    processor.decompose(article.find(class_='kirjoittaja'))

    title = processor.collect_text(article.find(class_='otsikko'))
    text = processor.collect_text(
        article.find(class_='tsv3-c-as-article__textitem--teksti'))
    images = processor.collect_images(article.find_all('img'), 'src',
                                      'http://www.aamuset.fi')
    captions = processor.collect_image_captions(
        article.find_all(class_='tsv3-c-as-article__attachment__caption'))

    return processor.create_dictionary('Aamuset', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, captions)
Example #19
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='article-content')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='related-articles-container'))

    categories = processor.collect_categories(
        article.find_all(class_='category'))

    datetime_data = article.find(class_='post-meta')
    processor.decompose(datetime_data.find(class_='category'))
    processor.decompose(datetime_data.find(class_='updated'))
    datetime_list = processor.collect_datetime(datetime_data)

    author = processor.collect_text(article.find(class_='author--main'))
    title = processor.collect_text(article.find(class_='heading--main'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='caption'))

    processor.decompose_all(article.find_all(class_='image-wrapper'))
    text = processor.collect_text(article.find(class_='content--main'))

    return processor.create_dictionary('Aamulehti', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, captions)
Example #20
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))

    categories = processor.collect_categories(
        article.find_all(class_='article-release-info__section'))
    datetime_list = processor.collect_datetime(
        article.find(class_='article-release-info__time'))
    author = processor.collect_text(article.find(itemprop='author'))

    title_div = article.find(class_='article-single-heading')
    title = processor.collect_text(title_div.find('h1'))
    ingress = processor.collect_text(title_div.find('p'))

    text = processor.collect_text(
        article.find(class_='article-single-section__content'))
    images = processor.collect_images(article.find_all('img'), 'src',
                                      'http://www.maaseuduntulevaisuus.fi')
    captions = processor.collect_image_captions(article.find_all('figcaption'))

    return processor.create_dictionary('Maaseudun tulevaisuus', url,
                                       r.status_code, categories,
                                       datetime_list, author, title, ingress,
                                       text, images, captions)
Example #21
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='content')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose_all(article.find_all(class_='ad'))
    processor.decompose_all(article.find_all(class_='ad-container'))
    processor.decompose_all(article.find_all('style'))
    processor.decompose(article.find(id='fullWidthBottom'))

    categories = processor.collect_categories(
        article.find_all(class_='article-category'))
    datetime_list = processor.collect_datetime_objects(
        article.find_all('time'), 'datetime')
    author = processor.collect_text(article.find(class_='author-name'))
    title = processor.collect_text(article.find(class_='article-title'))
    ingress = processor.collect_text(article.find(class_='lead-paragraph'))
    text = processor.collect_text(article.find(class_='editorial'))
    images = processor.collect_images_by_parent(
        article.find_all(class_='img-container'), 'http:')
    captions = processor.collect_image_captions(
        article.find_all(class_='figcaption'))

    return processor.create_dictionary('Mtv', url, r.status_code, categories,
                                       datetime_list, author, title, ingress,
                                       text, images, captions)
Example #22
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find('footer'))
    processor.decompose_all(article.find_all(class_='cb-module-title'))
    processor.decompose_all(article.find_all('blockquote'))
    processor.decompose_all(article.find_all('aside'))

    categories = processor.collect_categories(
        article.find_all(class_='cb-category'))
    datetime_list = processor.collect_datetime(article.find(class_='cb-date'))
    author = processor.collect_text(article.find(class_='cb-author'))
    title = processor.collect_text(article.find(class_='entry-title'))
    ingress = processor.collect_text(
        article.find(class_='cb-entry-content').find('h4'), True)
    text = processor.collect_text(article.find(class_='cb-entry-content'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='caption'))

    return processor.create_dictionary('Kansan uutiset', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
Example #23
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='post-single')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='avatar'))

    categories = processor.collect_categories(
        article.find_all(itemprop='articleSection'))
    datetime_list = processor.collect_datetime(
        article.find(itemprop='dateCreated datePublished'))
    author = processor.collect_text(article.find(rel='author'))
    title = processor.collect_text(article.find(itemprop='headline'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='sopuli-image-caption'))

    processor.decompose_all(article.find_all(itemprop='associatedMedia'))
    text = processor.collect_text(article.find(itemprop='articleBody'))

    return processor.create_dictionary('Kokemäenjokilaakson uutiset', url,
                                       r.status_code, categories,
                                       datetime_list, author, title, u'', text,
                                       images, captions)
Example #24
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='post')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose_all(article.find_all('blockquote'))
    processor.decompose(article.find(class_='author-avatar'))
    processor.decompose(
        article.find(id='after-single-post-widget-zone-single-post'))
    processor.decompose(article.find(id='sidebar'))

    categories = processor.collect_categories(
        article.find_all(class_='category'))
    datetime_list = processor.collect_datetime_objects(
        article.find_all('time'), 'datetime')
    author = processor.collect_text(article.find(itemprop='name'))
    title = processor.collect_text(article.find(class_='xt-post-title'))
    text = processor.collect_text(article.find(class_='post-body'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(article.find_all('figcaption'))

    return processor.create_dictionary('Mahorkka', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, captions)
Example #25
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='article-body')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))

    departments = article.find(class_='departments')
    categories = processor.collect_categories(departments.find_all('a'))
    datetime_list = processor.collect_datetime_objects(
        article.find_all('time'), 'datetime')
    author = processor.collect_text(article.find(class_='author'))
    title = processor.collect_text(article.find('h1'))
    ingress = processor.collect_text(article.find(class_='ingress'))

    # This does not get the text because HBL demands registration
    text = processor.collect_text(article.find(class_='text'))

    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='ksf-image-meta'))

    return processor.create_dictionary('Hufvudstadsbladet', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
Example #26
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    root = soup.find(id='root')
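    # NOTE: the hard-coded .contents indices below depend on the exact DOM
    # structure under #root; if the markup changes, they raise IndexError
    # before the None check further down is ever reached.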
    article_container = root.contents[0].contents[1].contents[3]

    article = article_container.contents[0].contents[2].contents[2]
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('aside'))

    categories = processor.collect_categories([article.find('h4')])
    datetime_list = processor.collect_datetime(article.contents[0])
    title = processor.collect_text(article.find('h1'))

    text_section = article.find('section')
    ingress = processor.collect_text(text_section.find('h3'))
    text_container = text_section.contents[0].contents[5]
    text = processor.collect_text(text_container)

    images = processor.collect_images([article.find('img')], 'src', '')
    captions = processor.collect_image_captions(article.find_all('figcaption'))

    return processor.create_dictionary('Talouselämä', url, r.status_code,
                                       categories, datetime_list, u'', title,
                                       ingress, text, images, captions)
Example #27
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='single-article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose_all(article.find_all(class_='print-url'))
    processor.decompose_all(article.find_all(class_='article-ad-block'))

    category = url.split('/')[3]
    categories = [category.capitalize().encode('utf8')]

    datetime_list = processor.collect_datetime(
        article.find(itemprop='datePublished'))
    author = processor.collect_text(article.find(itemprop='author'))
    title = processor.collect_text(article.find('h1'))
    ingress = processor.collect_text(article.find(class_='ingress'))
    text = processor.collect_text(article.find(class_='body'))
    images = processor.collect_images(article.find_all('img'), 'src', 'http:')
    captions = processor.collect_image_captions(
        article.find_all(itemprop='caption'))

    return processor.create_dictionary('Iltasanomat', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
Example #28
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))

    meta = article.find(class_='tsv3-c-common-article__meta__row1')

    categories = processor.collect_categories(meta.find_all('a'))
    datetime_list = processor.collect_datetime_objects(meta.find_all('time'),
                                                       'datetime')
    author = processor.collect_text(article.find(class_='kirjoittaja'))
    title = processor.collect_text(article.find(class_='otsikko'))
    text = processor.collect_text(
        article.find(class_='tsv3-c-common-article__textitem--teksti'))
    images = processor.collect_images(article.find_all('img'), 'src',
                                      'http://www.ts.fi')
    captions = processor.collect_image_captions(
        article.find_all(class_='tsv3-c-common-article__attachment__caption'))

    return processor.create_dictionary('Turun sanomat', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, captions)
Example #29
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='article-container')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='article__related'))
    processor.decompose_all(
        article.find_all(class_='smartblock--juttusivu-markkinointi'))

    meta = article.find(class_='news__meta')

    categories = [processor.collect_text(meta).split(' ')[0]]
    datetime_list = processor.collect_datetime(meta)
    author = processor.collect_text(meta.find(class_='news__source'))
    title = processor.collect_text(article.find('h1'))
    text = processor.collect_text(article.find(class_='article__text'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='image__caption'))

    return processor.create_dictionary('Kaleva', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, captions)
Example #30
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose_all(article.find_all(class_='attImage'))

    meta = article.find('time')

    categories = processor.collect_categories(meta.find_all('b'))
    datetime_list = processor.collect_datetime(meta)

    author = processor.collect_text(article.find(class_='Kirjoittaja'), True)
    title = processor.collect_text(article.find('h1'))
    ingress = processor.collect_text(article.find(class_='Alaotsikko'))
    text = processor.collect_text(article.find(class_='Teksti'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='featuredCaption'))

    return processor.create_dictionary('Kainuun sanomat', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)