def parse(url):
    """Fetch and parse a Keskisuomalainen article page.

    Returns the common article dictionary built by
    ``processor.create_dictionary`` (source, url, status, categories,
    datetimes, author, title, ingress, text, images, captions).  On a 404
    response or when no main content element is found, returns a dictionary
    of empty placeholder values.
    """
    r = requests.get(url)
    if r.status_code == 404:
        # Empty source name on error, consistent with the other site parsers
        # (and with the article-not-found branch below).
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    # Force UTF-8 before letting BeautifulSoup see the text.
    r.encoding = 'UTF-8'

    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(role='main')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    # Strip inline scripts so they do not pollute the collected text.
    processor.decompose_all(article.find_all('script'))

    categories = processor.collect_categories(
        article.find_all(class_='article__section'))
    datetime_list = processor.collect_datetime(
        article.find(class_='article__published'))
    author = processor.collect_text(article.find(class_='article__author'))
    title = processor.collect_text(article.find(class_='article__title'))
    ingress = processor.collect_text(article.find(class_='article__summary'))
    text = processor.collect_text(article.find(class_='article__body'))
    images = processor.collect_images_by_parent(
        article.find_all(class_='article__images'), '')
    captions = processor.collect_image_captions(
        article.find_all(itemprop='caption description'))

    return processor.create_dictionary('Keskisuomalainen', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
def parse(url):
    """Fetch and parse a Seura article page.

    Returns the common article dictionary built by
    ``processor.create_dictionary``.  On a 404 response or when the content
    wrapper is missing, returns a dictionary of empty placeholder values.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    # Force UTF-8 before letting BeautifulSoup see the text.
    r.encoding = 'UTF-8'

    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='content__wrapper')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    # Strip inline scripts so they do not pollute the collected text.
    processor.decompose_all(article.find_all('script'))

    categories = processor.collect_categories(
        article.find_all(class_='typography__category'))
    datetime_list = processor.collect_datetime(
        article.find(class_='meta-content'))
    author = processor.collect_text(article.find(class_='typography__author'))
    title = processor.collect_text(article.find(class_='content__title'))
    ingress = processor.collect_text(article.find(class_='content__intro'))
    text = processor.collect_text(article.find(class_='content__body'))
    images = processor.collect_images_by_parent(
        article.find_all(class_='content__main-gallery'), '')

    # Captions live in the gallery anchors' data-caption attribute as HTML;
    # re-parse each fragment and collect its text.  (Replaces the original
    # [None]-seed + pop(0) accumulation with an equivalent comprehension.)
    captions = [
        processor.collect_text(
            BeautifulSoup(caption_div.find('a')['data-caption'], "html.parser"))
        for caption_div in article.find_all(class_='content__main-gallery')
    ]

    return processor.create_dictionary('Seura', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
def parse(url):
    """Fetch and parse a Helsingin uutiset article page.

    Returns the common article dictionary built by
    ``processor.create_dictionary``.  The ingress slot is always empty for
    this source.  On a 404 response or when no ``<article>`` element is
    found, returns a dictionary of empty placeholder values.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    # Force UTF-8 before letting BeautifulSoup see the text.
    r.encoding = 'UTF-8'

    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    # Strip inline scripts so they do not pollute the collected text.
    processor.decompose_all(article.find_all('script'))

    # NOTE(review): assumes the department container is always present on a
    # live article page (raises AttributeError otherwise) — confirm.
    departments = article.find(class_='field-name-field-department-tref')
    categories = processor.collect_categories(departments.find_all('a'))

    datetime_list = processor.collect_datetime(
        article.find(class_='field-name-post-date'))

    # Author block is optional; drop its image before collecting the name.
    author = article.find(class_='author')
    if author is not None:
        processor.decompose(author.find(class_='img'))
        author = processor.collect_text(author.find('h3'))
    else:
        author = u''

    title = processor.collect_text(article.find('h1'))
    text = processor.collect_text(article.find(class_='field field-name-body'))
    images = processor.collect_images_by_parent(
        article.find_all(class_='img'), '')
    captions = processor.collect_image_captions(
        article.find_all(class_='caption'))

    return processor.create_dictionary('Helsingin uutiset', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, captions)
def parse(url):
    """Fetch and parse a Karjalainen article page.

    Returns the common article dictionary built by
    ``processor.create_dictionary``.  The ingress slot is always empty for
    this source.  On a 404 response or when the component container is
    missing, returns a dictionary of empty placeholder values.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    # Force UTF-8 before letting BeautifulSoup see the text.
    r.encoding = 'UTF-8'

    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(id='sp-component')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    # Strip inline scripts so they do not pollute the collected text.
    processor.decompose_all(article.find_all('script'))

    # Category links and the publication date share one metadata element.
    meta = article.find(class_='category_date')
    categories = processor.collect_categories(meta.find_all('a'))
    datetime_list = processor.collect_datetime(meta)

    author = processor.collect_text(article.find(class_='author_credits'))
    title = processor.collect_text(article.find('h1'))
    text = processor.collect_text(article.find(class_='itemIntroText'))
    # Image srcs are site-relative; prefix with the site origin.
    images = processor.collect_images_by_parent(
        article.find_all(class_='itemImage'), 'https://www.karjalainen.fi')
    captions = processor.collect_image_captions(
        article.find_all(class_='itemImageCaption'))

    return processor.create_dictionary('Karjalainen', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, captions)
def parse(url):
    """Fetch and parse an Mtv article page.

    Returns the common article dictionary built by
    ``processor.create_dictionary``.  On a 404 response or when the content
    container is missing, returns a dictionary of empty placeholder values.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    # Force UTF-8 before letting BeautifulSoup see the text.
    r.encoding = 'UTF-8'

    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='content')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''], u'', u'', u'', u'', [u''], [u''])

    # Strip scripts, ads, inline styles, and the bottom widget so they do
    # not pollute the collected text.
    processor.decompose_all(article.find_all('script'))
    processor.decompose_all(article.find_all(class_='ad'))
    processor.decompose_all(article.find_all(class_='ad-container'))
    processor.decompose_all(article.find_all('style'))
    processor.decompose(article.find(id='fullWidthBottom'))

    categories = processor.collect_categories(
        article.find_all(class_='article-category'))
    # Timestamps come from <time datetime="..."> attributes on this site.
    datetime_list = processor.collect_datetime_objects(
        article.find_all('time'), 'datetime')
    author = processor.collect_text(article.find(class_='author-name'))
    title = processor.collect_text(article.find(class_='article-title'))
    ingress = processor.collect_text(article.find(class_='lead-paragraph'))
    text = processor.collect_text(article.find(class_='editorial'))
    # Image srcs are protocol-relative; prefix with 'http:'.
    images = processor.collect_images_by_parent(
        article.find_all(class_='img-container'), 'http:')
    captions = processor.collect_image_captions(
        article.find_all(class_='figcaption'))

    return processor.create_dictionary('Mtv', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)