Example #1
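All of these snippets come from the same, apparently Python 2 era, scraper codebase (note the u'' string literals); each assumes import requests, from bs4 import BeautifulSoup and import processor (a project-local helper module) at module level, and most return the dictionary produced by processor.create_dictionary.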
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('blockquote'))

    categories_list = soup.find(class_='breadcrumb').find_all('li')[1:-1]
    categories = processor.collect_categories(categories_list)

    datetime_list = processor.collect_datetime(article.find(class_='meta'))

    authors = article.find(class_='authors')
    author = ','.join(
        processor.collect_text(div.find('p'))
        for div in authors.find_all(class_='author'))

    processor.decompose(authors)

    title = processor.collect_text(article.find('h1'))
    ingress = processor.collect_text(article.find(class_='lead'))

    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(article.find_all('figcaption'))

    processor.decompose(article.find(class_='sticky-outer-wrapper active'))
    processor.decompose(article.find('header'))
    processor.decompose(article.find('footer'))

    text = processor.collect_text(article).replace('0 0 0 0 ', '')

    return processor.create_dictionary('Kymen sanomat', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
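The processor helper itself is never shown in these examples, so the following is only a minimal sketch of the interface they appear to assume; the names and call signatures come from the snippets, the bodies are guesses. collect_datetime, collect_categories, collect_images and the other collectors are omitted because their behavior cannot be inferred as safely.

# Minimal sketch of the assumed processor interface; bodies are guesses,
# only the names and call shapes are taken from the examples.
def collect_text(element, strip=False):
    # Return an element's text, or an empty string for a missing element.
    if element is None:
        return u''
    text = element.get_text()
    return text.strip() if strip else text

def decompose(element):
    # Remove one element from the parse tree, tolerating None.
    if element is not None:
        element.decompose()

def decompose_all(elements):
    # Remove every element in a result set.
    for element in elements:
        decompose(element)

def create_dictionary(domain, url, status_code, categories, datetime_list,
                      author, title, ingress, text, images, captions):
    # Package the scraped fields into the flat dictionary every parser
    # returns; the key names here are an assumption.
    return {
        'domain': domain, 'url': url, 'status': status_code,
        'categories': categories, 'datetime': datetime_list,
        'author': author, 'title': title, 'ingress': ingress,
        'text': text, 'images': images, 'captions': captions,
    }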
Example #2
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='article__full')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))

    categories = processor.collect_categories(
        article.find_all(class_='article__meta__category'))

    title = processor.collect_text(article.find(class_='medium-title'))

    datetime_list = processor.collect_datetime(
        article.find(class_='article__meta__timestamp'))

    author = processor.collect_text(article.find(class_='author__name'))
    ingress = processor.collect_text(article.find(class_='lead'))

    text = ' '.join(
        processor.collect_text(paragraph)
        for paragraph in article.find_all('p')).strip()

    images = processor.collect_images(article.find_all('img'), 'src',
                                      'http://www.ilkka.fi')

    captions = []
    for caption_element in article.find_all(
            lambda tag: tag.name == 'a' and 'data-caption' in tag.attrs):
        captions.append(caption_element['data-caption'])

    return processor.create_dictionary('Ilkka', url, r.status_code, categories,
                                       datetime_list, author, title, ingress,
                                       text, images, captions)
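The lambda filter in Example #2 selects anchors that carry a data-caption attribute; BeautifulSoup can express the same match directly with an attribute filter, which reads a little more plainly:

captions = [a['data-caption']
            for a in article.find_all('a', attrs={'data-caption': True})]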
Example #3
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='node-wrap')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='kredIso'))
    processor.decompose_all(article.find_all(class_='tyrkkyBox'))
    processor.decompose(article.find(class_='avainsanat'))
    processor.decompose(article.find(class_='twitter-share-button'))
    processor.decompose(article.find(class_='fb-like'))
    processor.decompose(article.find(class_='moreLanka'))
    processor.decompose(article.find('cite'))

    meta = article.find(class_='juttutiedot')
    datetime_list = processor.collect_datetime(meta)
    author = processor.collect_text(meta.find(class_='author'))
    processor.decompose(meta)

    title = processor.collect_text(article.find('h2'), True)
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='kuvaTekstiIso'))

    processor.decompose_all(article.find_all(class_='kuvaTekstiIso'))
    processor.decompose_all(article.find_all('figcaption'))

    text = processor.collect_text(article)

    return processor.create_dictionary('Vihreä lanka', url, r.status_code,
                                       [u''], datetime_list, author, title,
                                       u'', text, images, captions)
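Example #3 works by elimination: the metadata block, captions and figure elements are all decomposed before the final collect_text(article) call, so the returned text is whatever body prose remains in the article element.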
Example #4
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='region bottom'))
    processor.decompose(
        article.find(class_='field-name-field-related-content'))

    categories = processor.collect_categories(
        article.find_all(class_='field-name-field-category'))
    datetime_list = processor.collect_datetime(
        article.find(class_='field-name-post-date'), 'timedate')
    author = processor.collect_text(
        article.find(class_='field-name-field-author'))
    title = processor.collect_text(article.find('h1'))
    ingress = processor.collect_text(
        article.find(class_='field-name-field-summary'))
    text = processor.collect_text(article.find(class_='field-name-field-body'))

    images = [
        image for image in processor.collect_images(
            article.find_all('img'), 'src', '')
        if 'placeholder' not in image
    ]

    captions = processor.collect_image_captions(
        article.find_all(class_='file-image-description-caption'))

    return processor.create_dictionary('Hyvä terveys', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
Example #5
def parse_from_archive(url, content):

    article = BeautifulSoup(content, "html.parser")

    if article is None:
        return processor.create_dictionary('Kauppalehti', url, 404, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    meta = article.find(class_='hakutuloslahde')

    domain = 'Kauppalehti'
    if 'online' in meta.text:
        domain += ' Online'

    datetime_list = processor.collect_datetime(meta)

    if ',' in meta.text:
        categories = [processor.collect_text(meta).split(',')[1].strip()]
    else:
        categories = [u'']

    author = processor.collect_text(article.find(class_='signeeraus'))

    title = processor.collect_text(article.find(class_='otsikko'))

    ingress = processor.collect_text(article.find_all(class_='jalkirivi')[1])
    ingress += ' ' + processor.collect_text(article.find(class_='esirivi'))
    ingress = ingress.strip()

    text_divs = article.find_all(class_='artikkelip')
    text = ''
    for text_content in text_divs:
        text += processor.collect_text(text_content) + ' '
    text = processor.process(text.strip())
    text += processor.collect_text(article.find(class_='korjaus'))

    captions = processor.collect_image_captions(
        article.find_all(class_='kuva'))

    return processor.create_dictionary(domain, url, 200, categories,
                                       datetime_list, author, title, ingress,
                                       text, [u''], captions)
Example #6
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='keywords-block'))
    processor.decompose_all(article.find_all(class_='share-buttons-block'))
    processor.decompose(article('p')[-1])
    processor.decompose(article.footer)
    processor.decompose(article.find(class_='wp-user-avatar'))

    categories = processor.collect_categories(
        article.find_all(class_='category'))

    datetime_data = article.find(class_='single-post-date')
    processor.decompose(datetime_data.find(class_='category'))
    datetime_list = processor.collect_datetime(datetime_data)

    processor.decompose(article.find(class_='single-post-date'))

    author = processor.collect_text(
        article.find(class_='post-author').find('li'))
    title = processor.collect_text(article.find(class_='entry-title'))
    text = processor.collect_text(article.find(class_='post-content'))
    images = processor.collect_images(article.find_all('img'), 'src',
                                      'https://demokraatti.fi')

    return processor.create_dictionary('Demokraatti', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, [u''])
Example #7
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))

    author_element = article.find(class_='author')
    if author_element is not None:
        processor.decompose(author_element.find(class_='img'))
        author = processor.collect_text(author_element.find('h3'))
    else:
        author = u''

    categories = processor.collect_categories(
        article.find_all(class_='field-name-field-department-tref'))
    datetime_list = processor.collect_datetime(
        article.find(class_='field-name-post-date'))

    title = processor.collect_text(article.find(class_='field-name-title'))
    text = processor.collect_text(article.find(class_='field field-name-body'))
    images = processor.collect_images_by_parent(article.find_all(class_='img'),
                                                '')
    captions = processor.collect_image_captions(
        article.find_all(class_='caption'))

    return processor.create_dictionary('Iltamakasiini', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, captions)
Example #8
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose_all(
        article.find_all(class_='views-field-field-aamuset-related-images'))

    categories_element = soup.find(class_='tsv3-c-as-articletags')
    categories = processor.collect_categories(
        categories_element.find_all('li'))

    datetime_list = processor.collect_datetime(article.find('time'))

    author = processor.collect_text(article.find(class_='kirjoittaja'))
    processor.decompose(article.find(class_='kirjoittaja'))

    title = processor.collect_text(article.find(class_='otsikko'))
    text = processor.collect_text(
        article.find(class_='tsv3-c-as-article__textitem--teksti'))
    images = processor.collect_images(article.find_all('img'), 'src',
                                      'http://www.aamuset.fi')
    captions = processor.collect_image_captions(
        article.find_all(class_='tsv3-c-as-article__attachment__caption'))

    return processor.create_dictionary('Aamuset', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, captions)
Example #9
def parse_from_archive(url, content):
    article = BeautifulSoup(content, "html.parser")

    if article is None:
        return processor.create_dictionary('Satakunnan kansa', url, 404, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    meta = article.find(class_='hakutuloslahde')

    datetime_list = processor.collect_datetime(meta)

    category = processor.collect_text(meta).split(',')[1].strip()
    subcat = processor.collect_text(article.find(class_='jalkirivi'))

    categories = []
    for c in [category, subcat]:
        if c:
            categories.append(c)

    author = processor.collect_text(article.find(class_='signeeraus'))

    title = processor.collect_text(article.find(class_='otsikko'))

    ingress = processor.collect_text(article.find_all(class_='jalkirivi')[1])
    ingress += ' ' + processor.collect_text(article.find(class_='esirivi'))
    ingress = ingress.strip()

    text_divs = article.find_all(class_='artikkelip')
    text = ''
    for text_content in text_divs:
        text += processor.collect_text(text_content) + ' '
    text = processor.process(text.strip())
    text += processor.collect_text(article.find(class_='korjaus'))

    captions = processor.collect_image_captions(
        article.find_all(class_='kuva'))

    return processor.create_dictionary('Satakunnan kansa', url, 200,
                                       categories, datetime_list, author,
                                       title, ingress, text, [u''], captions)
Example #10
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='content__wrapper')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))

    categories = processor.collect_categories(
        article.find_all(class_='typography__category'))
    datetime_list = processor.collect_datetime(
        article.find(class_='meta-content'))
    author = processor.collect_text(article.find(class_='typography__author'))
    title = processor.collect_text(article.find(class_='content__title'))
    ingress = processor.collect_text(article.find(class_='content__intro'))
    text = processor.collect_text(article.find(class_='content__body'))
    images = processor.collect_images_by_parent(
        article.find_all(class_='content__main-gallery'), '')

    captions = []
    for caption_div in article.find_all(class_='content__main-gallery'):
        caption = BeautifulSoup(
            caption_div.find('a')['data-caption'], "html.parser")
        captions.append(processor.collect_text(caption))

    return processor.create_dictionary('Suomen kuvalehti', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
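In Example #10 each gallery link stores its caption as HTML inside a data-caption attribute, so the attribute value is re-parsed with a nested BeautifulSoup call and passed through collect_text to strip its markup.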
Example #11
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='article-content')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='related-articles-container'))

    categories = processor.collect_categories(
        article.find_all(class_='category'))

    datetime_data = article.find(class_='post-meta')
    processor.decompose(datetime_data.find(class_='category'))
    processor.decompose(datetime_data.find(class_='updated'))
    datetime_list = processor.collect_datetime(datetime_data)

    author = processor.collect_text(article.find(class_='author--main'))
    title = processor.collect_text(article.find(class_='heading--main'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='caption'))

    processor.decompose_all(article.find_all(class_='image-wrapper'))
    text = processor.collect_text(article.find(class_='content--main'))

    return processor.create_dictionary('Aamulehti', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, captions)
Example #12
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))

    categories = processor.collect_categories(
        article.find_all(class_='article-release-info__section'))
    datetime_list = processor.collect_datetime(
        article.find(class_='article-release-info__time'))
    author = processor.collect_text(article.find(itemprop='author'))

    title_div = article.find(class_='article-single-heading')
    title = processor.collect_text(title_div.find('h1'))
    ingress = processor.collect_text(title_div.find('p'))

    text = processor.collect_text(
        article.find(class_='article-single-section__content'))
    images = processor.collect_images(article.find_all('img'), 'src',
                                      'http://www.maaseuduntulevaisuus.fi')
    captions = processor.collect_image_captions(article.find_all('figcaption'))

    return processor.create_dictionary('Maaseudun tulevaisuus', url,
                                       r.status_code, categories,
                                       datetime_list, author, title, ingress,
                                       text, images, captions)
Example #13
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return {}

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    thread = soup.find(class_='thread')
    info = thread.find(class_='user-info-big')
    breadcrumbs = []
    for li in thread.find(class_='breadcrumb').find_all('li'):
        breadcrumbs.append(processor.collect_text(li))

    title = breadcrumbs[-1]
    topics = breadcrumbs[1:-1]
    user = processor.collect_text(info.find(class_='user-info-name'))
    user_role = processor.collect_text(info.find(class_='user-info-role'))
    time = str(
        processor.collect_datetime(info.find(class_='user-info-timestamp'))[0])
    text = processor.collect_text(thread.find(class_='thread-text'))

    removed_comments = len(thread.find_all(class_='comment-removed'))
    processor.decompose_all(thread.find_all(class_='comment-removed'))

    answers = get_answers(thread.find_all(class_='answer-block-container'),
                          user)

    return {
        'title': title,
        'topics': topics,
        'user': user,
        'user_role': user_role,
        'time': time,
        'text': text,
        'answers': answers,
        'removed_comments': removed_comments
    }
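Unlike the news parsers, this forum parser returns its own ad-hoc dictionary rather than processor.create_dictionary. A minimal driver, assuming a reachable thread page (the URL below is hypothetical):

result = parse('https://example.com/thread/123')  # hypothetical URL
if result:
    print(result['title'])
    print('%d answers, %d removed comments' % (len(result['answers']),
                                               result['removed_comments']))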
Example #14
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find('footer'))
    processor.decompose_all(article.find_all(class_='cb-module-title'))
    processor.decompose_all(article.find_all('blockquote'))
    processor.decompose_all(article.find_all('aside'))

    categories = processor.collect_categories(
        article.find_all(class_='cb-category'))
    datetime_list = processor.collect_datetime(article.find(class_='cb-date'))
    author = processor.collect_text(article.find(class_='cb-author'))
    title = processor.collect_text(article.find(class_='entry-title'))
    ingress = processor.collect_text(
        article.find(class_='cb-entry-content').find('h4'), True)
    text = processor.collect_text(article.find(class_='cb-entry-content'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='caption'))

    return processor.create_dictionary('Kansan uutiset', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
Example #15
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='post-single')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='avatar'))

    categories = processor.collect_categories(
        article.find_all(itemprop='articleSection'))
    datetime_list = processor.collect_datetime(
        article.find(itemprop='dateCreated datePublished'))
    author = processor.collect_text(article.find(rel='author'))
    title = processor.collect_text(article.find(itemprop='headline'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='sopuli-image-caption'))

    processor.decompose_all(article.find_all(itemprop='associatedMedia'))
    text = processor.collect_text(article.find(itemprop='articleBody'))

    return processor.create_dictionary('Kokemäenjokilaakson uutiset', url,
                                       r.status_code, categories,
                                       datetime_list, author, title, u'', text,
                                       images, captions)
Example #16
def parse_from_archive(url, content):
    article = BeautifulSoup(content, "html.parser")

    if article is None:
        return processor.create_dictionary('Keskisuomalainen', url, 404, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))

    meta = article.find(class_='date')

    categories = [processor.collect_text(meta).split(' ')[0]]
    datetime_list = processor.collect_datetime(meta)
    author = processor.collect_text(article.find(class_='author'), True)

    processor.decompose(meta)

    title = ' '.join(
        processor.collect_text(part, True) for part in article.find_all('h2'))

    ingress = ' '.join(
        processor.collect_text(part, True) for part in article.find_all('h4'))

    processor.decompose(article.find_all('p')[-1])

    text = processor.collect_text(article)

    return processor.create_dictionary('Keskisuomalainen', url, 200,
                                       categories, datetime_list, author,
                                       title, ingress, text, [u''], [u''])
Example #17
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    root = soup.find(id='root')
    article_container = root.contents[0].contents[1].contents[3]

    article = article_container.contents[0].contents[2].contents[2]
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('aside'))

    categories = processor.collect_categories([article.find('h4')])
    datetime_list = processor.collect_datetime(article.contents[0])
    title = processor.collect_text(article.find('h1'))

    text_section = article.find('section')
    ingress = processor.collect_text(text_section.find('h3'))
    text_container = text_section.contents[0].contents[5]
    text = processor.collect_text(text_container)

    images = processor.collect_images([article.find('img')], 'src', '')
    captions = processor.collect_image_captions(article.find_all('figcaption'))

    return processor.create_dictionary('Talouselämä', url, r.status_code,
                                       categories, datetime_list, u'', title,
                                       ingress, text, images, captions)
Example #18
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='single-article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose_all(article.find_all(class_='print-url'))
    processor.decompose_all(article.find_all(class_='article-ad-block'))

    category = url.split('/')[3]
    categories = [category.capitalize().encode('utf8')]

    datetime_list = processor.collect_datetime(
        article.find(itemprop='datePublished'))
    author = processor.collect_text(article.find(itemprop='author'))
    title = processor.collect_text(article.find('h1'))
    ingress = processor.collect_text(article.find(class_='ingress'))
    text = processor.collect_text(article.find(class_='body'))
    images = processor.collect_images(article.find_all('img'), 'src', 'http:')
    captions = processor.collect_image_captions(
        article.find_all(itemprop='caption'))

    return processor.create_dictionary('Iltasanomat', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
Example #19
def get_comments(comments_html_element, to_user):
    comments = []

    comments_list = comments_html_element.find(class_='comments-list')

    if comments_list:

        for comment_div in comments_list.find_all(class_='comment'):
            comment_data = {}

            comment_data['likes'] = processor.collect_text(
                comment_div.find(class_='action-bar-vote-count'))
            processor.decompose(comment_div.find(class_='action-bar'))

            comment_data['user'] = processor.collect_text(
                comment_div.find(class_='user-info-name'))
            comment_data['user_role'] = processor.collect_text(
                comment_div.find(class_='user-info-role'))
            comment_data['time'] = str(
                processor.collect_datetime(
                    comment_div.find(class_='user-info-timestamp'))[0])
            comment_data['text'] = processor.collect_text(
                comment_div.find(class_='comment-text'))
            comment_data['to'] = to_user
            comment_data['quote'] = {}

            blockquote = comment_div.find('blockquote')
            if blockquote:
                comment_data['quote']['quoted_user'] = processor.collect_text(
                    blockquote.find('header').find('strong'))
                comment_data['quote']['text'] = processor.collect_text(
                    blockquote.find(
                        class_='text-muted blockquote-collapse-body'))

            comments.append(comment_data)

    return comments
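Example #13 calls an undisclosed get_answers helper over the thread's answer-block-container elements, and get_comments here looks like its building block. A guessed composition, consistent with that call but not confirmed by the source:

def get_answers(answer_blocks, original_poster):
    # Guessed helper: gather each answer block's comments, treating the
    # thread starter as the user being answered.
    answers = []
    for block in answer_blocks:
        answers.extend(get_comments(block, original_poster))
    return answers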
Example #20
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose_all(article.find_all(class_='attImage'))

    meta = article.find('time')

    categories = processor.collect_categories(meta.find_all('b'))
    datetime_list = processor.collect_datetime(meta)

    author = processor.collect_text(article.find(class_='Kirjoittaja'), True)
    title = processor.collect_text(article.find('h1'))
    ingress = processor.collect_text(article.find(class_='Alaotsikko'))
    text = processor.collect_text(article.find(class_='Teksti'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='featuredCaption'))

    return processor.create_dictionary('Kainuun sanomat', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
Example #21
def parse(url):

    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='article-container')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='article__related'))
    processor.decompose_all(
        article.find_all(class_='smartblock--juttusivu-markkinointi'))

    meta = article.find(class_='news__meta')

    categories = [processor.collect_text(meta).split(' ')[0]]
    datetime_list = processor.collect_datetime(meta)
    author = processor.collect_text(meta.find(class_='news__source'))
    title = processor.collect_text(article.find('h1'))
    text = processor.collect_text(article.find(class_='article__text'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='image__caption'))

    return processor.create_dictionary('Kaleva', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, captions)
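All of the site-specific parsers above share one calling convention (parse(url) returning a dictionary), so a crawler only needs to map each domain to its parser. A minimal dispatcher sketch; the registry and module layout are assumptions, not part of the original code:

# Hypothetical dispatcher; in the real codebase each parse presumably
# lives in its own module, imported here under a site-specific name.
PARSERS = {
    'www.kaleva.fi': parse,  # one entry per supported site
}

def scrape(url):
    # Pick the parser registered for the URL's domain, if any.
    domain = url.split('/')[2]
    parser = PARSERS.get(domain)
    return parser(url) if parser is not None else None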