def parse(url):
    """Parse a Taloussanomat article page into the standard article dict.

    Returns an empty shell dictionary on a 404 response or when the
    expected article container is missing from the page.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")
    article = soup.find(class_='single-article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    # Strip scripts, related-article links and ad blocks before extraction.
    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='related-links'))
    processor.decompose_all(article.find_all(class_='article-ad-block'))
    categories = processor.collect_categories(soup.find(class_='section-title'))
    datetime_list = processor.collect_datetime(article.find('time'))
    author = processor.collect_text(article.find(class_='byline'))
    title = processor.collect_text(article.find(class_='article-title'))
    ingress = processor.collect_text(article.find(class_='ingress'))
    text = processor.collect_text(article.find(class_='body'))
    images = processor.collect_images(article.find_all('img'), 'src', 'http:')
    captions = processor.collect_image_captions(article.find_all('figcaption'))
    return processor.create_dictionary('Taloussanomat', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
def parse(url):
    """Parse a Tiedonantaja article page into the standard article dict.

    Returns an empty shell dictionary on a 404 response or when the
    <main> element holding the article is missing.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")
    article = soup.find('main')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    # Remove scripts and highlight boxes before collecting text.
    processor.decompose_all(article.find_all('script'))
    processor.decompose_all(article.find_all(class_='nosto'))
    links = article.find(class_='links')
    categories = processor.collect_categories(links.find_all('li'))
    datetime_list = processor.collect_datetime(
        article.find(class_='field-name-field-publish-date'))
    author = processor.collect_text(article.find(class_='tekija'))
    title = processor.collect_text(article.find(id='page-title'))
    text = processor.collect_text(article.find(class_='body'))
    images = processor.collect_images(
        article.find(class_='views-field-field-op-main-image').find_all('img'),
        'src', '')
    return processor.create_dictionary('Tiedonantaja', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, [u''])
def parse(url):
    """Fetch an Yle X article via the JSON API and build the article dict.

    The article id is taken from the last path segment of *url*.
    """
    api_path = 'http://yle.fi/ylex/api/article/'
    _id = url.split('/')[-1]
    r = requests.get(api_path + _id)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    r.encoding = 'UTF-8'
    # Named 'data' so the stdlib json module name is not shadowed.
    data = r.json()
    categories = [processor.process(data['homesection']['name'])]
    datetime_list = processor.collect_datetime_json(data, 'datePublished',
                                                    'dateModified')
    # NOTE(review): assumes at least one author entry — confirm the API
    # always provides 'authors'.
    author = processor.process(data['authors'][0]['name'])
    title = processor.process(data['title'])
    ingress = processor.process(data['lead'])
    text_html = BeautifulSoup(data['html'], "html.parser")
    text = processor.collect_text(text_html)
    if 'image' in data:
        image_json = data['image']
        images = [image_json['uri']]
        captions = [image_json['alt']]
    else:
        images, captions = [u''], [u'']
    return processor.create_dictionary('Yle X', url, r.status_code, categories,
                                       datetime_list, author, title, ingress,
                                       text, images, captions)
def parse(url):
    """Parse a Seura article page into the standard article dictionary.

    Image captions are stored HTML-encoded in the gallery links'
    'data-caption' attributes, so each one is re-parsed before collection.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")
    article = soup.find(class_='content__wrapper')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    processor.decompose_all(article.find_all('script'))
    categories = processor.collect_categories(
        article.find_all(class_='typography__category'))
    datetime_list = processor.collect_datetime(
        article.find(class_='meta-content'))
    author = processor.collect_text(article.find(class_='typography__author'))
    title = processor.collect_text(article.find(class_='content__title'))
    ingress = processor.collect_text(article.find(class_='content__intro'))
    text = processor.collect_text(article.find(class_='content__body'))
    images = processor.collect_images_by_parent(
        article.find_all(class_='content__main-gallery'), '')
    # Start from an empty list instead of the original [None]+pop(0) dance.
    captions = []
    for caption_div in article.find_all(class_='content__main-gallery'):
        caption = BeautifulSoup(caption_div.find('a')['data-caption'],
                                "html.parser")
        captions.append(processor.collect_text(caption))
    return processor.create_dictionary('Seura', url, r.status_code, categories,
                                       datetime_list, author, title, ingress,
                                       text, images, captions)
def parse_from_archive(url, content):
    """Parse a Kainuun sanomat article from archived HTML *content*.

    *content* is raw HTML; the returned dictionary always reports
    status 200 (404 is only used when parsing fails outright).
    """
    article = BeautifulSoup(content, "html.parser")
    if article is None:
        return processor.create_dictionary('Kainuun sanomat', url, 404, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])
    meta = article.find(class_='hakutuloslahde')
    datetime_list = processor.collect_datetime(meta)
    # Meta line has the form "<source>, <category>"; take the category part.
    categories = [processor.collect_text(meta).split(',')[1].strip()]
    author = processor.collect_text(article.find(class_='signeeraus'))
    title = processor.collect_text(article.find(class_='otsikko'))
    # Join paragraph divs with spaces instead of quadratic string +=.
    text = ' '.join(processor.collect_text(div)
                    for div in article.find_all(class_='artikkelip')).strip()
    captions = processor.collect_image_captions(
        article.find_all(class_='kuva'))
    return processor.create_dictionary('Kainuun sanomat', url, 200, categories,
                                       datetime_list, author, title, u'', text,
                                       [u''], captions)
def parse(url):
    """Parse an Uusi Suomi article page into the standard article dict."""
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")
    article = soup.find(class_='region-content-inner')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    # Drop scripts, the comment section and author decorations first.
    processor.decompose_all(article.find_all('script'))
    processor.decompose_all(article.find_all('noscript'))
    processor.decompose(article.find(id='comments'))
    processor.decompose(article.find(class_='contributor'))
    processor.decompose(article.find(class_='field-name-field-author-image'))
    categories = processor.collect_categories(
        article.find_all(class_='field-name-field-category'))
    datetime_list = processor.collect_datetime_objects(
        article.find_all(class_='date-display-single'), 'content')
    author = processor.collect_text(article.find(class_='author-name'))
    title = processor.collect_text(article.find(id='page-title'))
    text = processor.collect_text(article.find(class_='field-name-body'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='field-name-field-image-description'))
    return processor.create_dictionary('Uusi Suomi', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, captions)
def parse(url):
    """Parse an Iltalehti article page into the standard article dict."""
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    # The site serves Latin-1 rather than UTF-8.
    r.encoding = 'iso-8859-1'
    soup = BeautifulSoup(r.text, "html.parser")
    article = soup.find(id='container_keski')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='kp-share-area'))
    categories = processor.collect_categories(soup.find_all(class_='sel'))
    datetime_list = processor.collect_datetime(
        article.find(class_='juttuaika'))
    author_div = article.find(class_='author')
    processor.decompose(author_div.find('a'))
    author = processor.collect_text(author_div, True)
    title = processor.collect_text(article.find('h1'))
    ingress = processor.collect_text(article.find(class_='ingressi'), True)
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='kuvateksti'))
    # Captions were collected above, so the image containers can be removed
    # before the body text is gathered.
    processor.decompose_all(article.find_all(class_='kuvamiddle'))
    text = processor.collect_text(article.find('isense'))
    return processor.create_dictionary('Iltalehti', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
def parse(url):
    """Parse an Iltalehti blog post into the standard article dictionary.

    The publish date is not read from the page; it is embedded in the URL
    as /YYYY/MM/DD/ path segments.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")
    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    processor.decompose_all(article.find_all('script'))
    title = processor.collect_text(article.find(class_='entry-title'))
    url_elements = url.split('/')
    year = url_elements[4]
    month = url_elements[5]
    day = url_elements[6]
    # Build a date object directly instead of datetime.date(datetime.strptime()).
    datetime_list = [
        datetime.strptime(day + '.' + month + '.' + year, "%d.%m.%Y").date()]
    author = processor.collect_text(article.find(class_='author vcard'))
    text = processor.collect_text(article.find(class_='entry-content'))
    return processor.create_dictionary('Iltalehti Blogit', url, r.status_code,
                                       [u''], datetime_list, author, title,
                                       u'', text, [u''], [u''])
def parse(url):
    """Parse a Faktabaari article page into the standard article dictionary.

    Faktabaari pages carry no category metadata, so the category and
    caption slots are returned empty.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")
    article = soup.find(id='main-content')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='reviewpic'))
    datetime_list = processor.collect_datetime(
        article.find(class_='published'))
    author = processor.collect_text(article.find(class_='author'))
    title = processor.collect_text(article.find('h1'))
    text = processor.collect_text(article.find(class_='entry-content'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    return processor.create_dictionary('Faktabaari', url, r.status_code,
                                       [u''], datetime_list, author, title,
                                       u'', text, images, [u''])
def parse(url):
    """Parse a Kd-lehti article page into the standard article dictionary."""
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")
    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    # Strip scripts, header images, pull quotes and the meta sidebar.
    processor.decompose_all(article.find_all('script'))
    processor.decompose_all(article.find('header').find_all('img'))
    processor.decompose_all(article.find_all('blockquote'))
    processor.decompose(article.find(class_="meta-sidebar"))
    categories = processor.collect_categories(article.find_all(class_='cat'))
    datetime_list = processor.collect_datetime(article.find(class_='date'))
    author = processor.collect_text(article.find(class_='author'))
    title = processor.collect_text(article.find(class_='article-title'))
    ingress = processor.collect_text(article.find(class_='ingress'), True)
    text = processor.collect_text(article.find(class_='content'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='featured-image'))
    return processor.create_dictionary('Kd-lehti', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
def parse(url):
    """Parse a Karjalainen article page into the standard article dict."""
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")
    article = soup.find(id='sp-component')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    processor.decompose_all(article.find_all('script'))
    # Categories and publication date share one meta element.
    meta = article.find(class_='category_date')
    categories = processor.collect_categories(meta.find_all('a'))
    datetime_list = processor.collect_datetime(meta)
    author = processor.collect_text(article.find(class_='author_credits'))
    title = processor.collect_text(article.find('h1'))
    text = processor.collect_text(article.find(class_='itemIntroText'))
    images = processor.collect_images_by_parent(
        article.find_all(class_='itemImage'), 'https://www.karjalainen.fi')
    captions = processor.collect_image_captions(
        article.find_all(class_='itemImageCaption'))
    return processor.create_dictionary('Karjalainen', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, captions)
def parse(url):
    """Parse an Etelä-Suomen Sanomat article page into the article dict."""
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")
    article = soup.find(class_='mainArticle-content-wrapper')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    processor.decompose_all(article.find_all('script'))
    header = article.find(id='main-article-header')
    categories = processor.collect_categories(
        header.find_all(class_='section'))
    datetime_list = processor.collect_datetime(
        article.find(class_='article-date'))
    author = processor.collect_text(article.find(class_='authorName'))
    title = processor.collect_text(article.find(class_='main-article-header'))
    text = processor.collect_text(article.find(class_='body'))
    # Remove the author photo and subscribe box only after the body text is
    # collected, but before gathering images.
    processor.decompose(article.find(class_='authorPicture'))
    processor.decompose(article.find(id='main-subscribe'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='main-media-caption'))
    return processor.create_dictionary('Etelä-Suomen Sanomat', url,
                                       r.status_code, categories,
                                       datetime_list, author, title, u'',
                                       text, images, captions)
def parse(url):
    """Parse a Helsingin uutiset article page into the article dictionary."""
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")
    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    processor.decompose_all(article.find_all('script'))
    departments = article.find(class_='field-name-field-department-tref')
    categories = processor.collect_categories(departments.find_all('a'))
    datetime_list = processor.collect_datetime(
        article.find(class_='field-name-post-date'))
    # The author box is optional; fall back to an empty author.
    author = article.find(class_='author')
    if author is not None:
        processor.decompose(author.find(class_='img'))
        author = processor.collect_text(author.find('h3'))
    else:
        author = u''
    title = processor.collect_text(article.find('h1'))
    text = processor.collect_text(article.find(class_='field field-name-body'))
    images = processor.collect_images_by_parent(
        article.find_all(class_='img'), '')
    captions = processor.collect_image_captions(
        article.find_all(class_='caption'))
    return processor.create_dictionary('Helsingin uutiset', url,
                                       r.status_code, categories,
                                       datetime_list, author, title, u'',
                                       text, images, captions)
def parse(url):
    """Parse a Keskisuomalainen article page into the article dictionary."""
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('Keskisuomalainen', url,
                                           r.status_code, [u''], [u''], u'',
                                           u'', u'', u'', [u''], [u''])
    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")
    article = soup.find(role='main')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    processor.decompose_all(article.find_all('script'))
    categories = processor.collect_categories(
        article.find_all(class_='article__section'))
    datetime_list = processor.collect_datetime(
        article.find(class_='article__published'))
    author = processor.collect_text(article.find(class_='article__author'))
    title = processor.collect_text(article.find(class_='article__title'))
    ingress = processor.collect_text(article.find(class_='article__summary'))
    text = processor.collect_text(article.find(class_='article__body'))
    images = processor.collect_images_by_parent(
        article.find_all(class_='article__images'), '')
    captions = processor.collect_image_captions(
        article.find_all(itemprop='caption description'))
    return processor.create_dictionary('Keskisuomalainen', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
def parse(url):
    """Parse a Suomen uutiset article page into the article dictionary.

    The timestamp is free text, so it is normalised (month name converted,
    commas removed) before strptime parsing.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")
    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    processor.decompose_all(article.find_all('script'))
    processor.decompose_all(article.find_all(class_='somebar'))
    processor.decompose(article.find(class_='tags'))
    categories = processor.collect_categories(
        article.find_all(class_='post-category'))
    datetime_string = article.find(class_='timestamp').get_text(' ',
                                                                strip=True)
    datetime_string = processor.convert_month(datetime_string.replace(',', ''))
    datetime_list = [datetime.strptime(datetime_string, '%m %d %Y %H:%M')]
    author = processor.collect_text(
        article.find(class_='article-page-writer'), True)
    title = processor.collect_text(article.find(class_='post-title'))
    text = processor.collect_text(article.find(class_='post-content'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all('figcaption'))
    return processor.create_dictionary('Suomen uutiset', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, captions)
def parse(url):
    """Parse a Lapin kansa article page into the standard article dict."""
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")
    article = soup.find(class_='main-content-area')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    processor.decompose_all(article.find_all('script'))
    meta = article.find(class_='post-meta')
    categories = processor.collect_categories(
        meta.find_all(class_='category'), True)
    datetime_list = processor.collect_datetime(meta, 'datetime date')
    author = processor.collect_text(article.find(class_='author--main'))
    title = processor.collect_text(article.find(class_='heading--main'))
    ingress = processor.collect_text(article.find(class_='heading--secondary'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='caption'))
    # Image wrappers are removed only after their captions are collected,
    # so the body text excludes them.
    processor.decompose_all(article.find_all(class_='image-wrapper'))
    text = processor.collect_text(article.find(class_='content--main'))
    return processor.create_dictionary('Lapin kansa', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
def parse(url):
    """Fetch a Kauppalehti article via the JSON API and build the dict.

    The article id is taken from the last path segment of *url*.
    """
    api_path = 'https://www.kauppalehti.fi/api/news/article/'
    _id = url.split('/')[-1]
    r = requests.get(api_path + _id)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    r.encoding = 'UTF-8'
    # Named 'data' so the stdlib json module name is not shadowed.
    data = r.json()
    categories = [processor.process(data['mainCategory']['name'])]
    datetime_list = processor.collect_datetime_json(data, 'published',
                                                    'modified')
    author = processor.process(data['byline'][0])
    title = processor.process(data['title'])
    ingress = processor.process(data['headline'])
    text_html = BeautifulSoup(data['body'], "html.parser")
    text = processor.collect_text(text_html)
    if 'keyImage' in data:
        # Route the key image through the site's resizing proxy.
        image_url = 'http://images.kauppalehti.fi/547x/http:' + data['keyImage']
        images = [image_url]
    else:
        images = [u'']
    return processor.create_dictionary('Kauppalehti', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, [u''])
def parse(url):
    """Parse a Kouvolan sanomat article page into the article dictionary.

    Multiple authors are joined into one comma-separated string.
    """
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")
    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    processor.decompose_all(article.find_all('blockquote'))
    processor.decompose(article.find(class_='sticky-inner-wrapper'))
    # Breadcrumb: drop the first (front page) and last (current article).
    categories_list = soup.find(class_='breadcrumb').find_all('li')[1:-1]
    categories = processor.collect_categories(categories_list)
    datetime_list = processor.collect_datetime(article.find(class_='meta'))
    authors = article.find(class_='authors')
    author = ','.join(processor.collect_text(div.find('p'))
                      for div in authors.find_all(class_='author'))
    processor.decompose(authors)
    title = processor.collect_text(article.find('h1'))
    ingress = processor.collect_text(article.find(class_='lead'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all('figcaption'))
    # Strip the remaining chrome, then take everything left as body text.
    processor.decompose(article.find(class_='sticky-outer-wrapper active'))
    processor.decompose(article.find('header'))
    processor.decompose(article.find('footer'))
    text = processor.collect_text(article)
    return processor.create_dictionary('Kouvolan sanomat', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
def parse(url):
    """Parse a Suomenmaa article page into the standard article dict."""
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")
    # Remove boilerplate page chrome before collecting anything.
    for chrome_class in ('pohja', 'footer_left', 'keski_footer',
                         'right_footer', 'sitaatti'):
        processor.decompose_all(soup.find_all(class_=chrome_class))
    categories = [processor.collect_text(soup.find(class_='vinjetti'))]
    processor.decompose_all(soup.find_all(class_='vinjetti'))
    datetime_list = processor.collect_datetime(
        soup.find(class_='datetime').parent.parent)
    datetime_list.reverse()
    author = processor.collect_text(soup.find(class_='text-editor'))
    title = processor.collect_text(soup.find(class_='otsikko'))
    ingress = processor.collect_text(soup.find(class_='alarivi'))
    processor.decompose_all(soup.find_all(class_='alarivi'))
    # Accumulate body paragraphs, skipping any block whose text already
    # appears in what has been gathered (the markup repeats paragraphs).
    text = ''
    for block in soup.find_all(class_='teksti'):
        block_text = processor.collect_text(block)
        if block_text not in text:
            text = text + ' ' + block_text
    text = text.strip()
    images, captions = [u''], [u'']
    img_div = soup.find(class_='pikkukuva')
    if img_div:
        header_img = img_div.find_all('img')
        images = processor.collect_images(header_img, 'data-aghref',
                                          'http://www.suomenmaa.fi/')
        captions = [header_img[0]['alt']]
    return processor.create_dictionary('Suomenmaa', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
def parse(url):
    """Parse an Ilkka article page into the standard article dictionary."""
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")
    article = soup.find(class_='article__full')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    processor.decompose_all(article.find_all('script'))
    categories = processor.collect_categories(
        article.find_all(class_='article__meta__category'))
    title = processor.collect_text(article.find(class_='medium-title'))
    datetime_list = processor.collect_datetime(
        article.find(class_='article__meta__timestamp'))
    author = processor.collect_text(article.find(class_='author__name'))
    ingress = processor.collect_text(article.find(class_='lead'))
    # Join paragraph texts with spaces instead of quadratic string +=.
    text = ' '.join(processor.collect_text(p)
                    for p in article.find_all('p')).strip()
    images = processor.collect_images(article.find_all('img'), 'src',
                                      'http://www.ilkka.fi')
    # Captions live in 'data-caption' attributes on anchor tags.
    captions = [tag['data-caption'] for tag in article.find_all(
        lambda tag: tag.name == 'a' and 'data-caption' in tag.attrs)]
    return processor.create_dictionary('Ilkka', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
def parse(url):
    """Parse a Vihreä lanka article page into the standard article dict."""
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")
    article = soup.find(class_='node-wrap')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='kredIso'))
    processor.decompose_all(article.find_all(class_='tyrkkyBox'))
    processor.decompose(article.find(class_='avainsanat'))
    processor.decompose(article.find(class_='twitter-share-button'))
    processor.decompose(article.find(class_='fb-like'))
    processor.decompose(article.find(class_='moreLanka'))
    # A second 'kredIso' block may remain after the first decompose above;
    # remove that one too.
    processor.decompose(article.find(class_='kredIso'))
    processor.decompose(article.find('cite'))
    meta = article.find(class_='juttutiedot')
    datetime_list = processor.collect_datetime(meta)
    author = processor.collect_text(meta.find(class_='author'))
    processor.decompose(meta)
    title = processor.collect_text(article.find('h2'), True)
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='kuvaTekstiIso'))
    # Captions are collected above; drop them so the remaining markup is
    # pure body text.
    processor.decompose_all(article.find_all(class_='kuvaTekstiIso'))
    processor.decompose_all(article.find_all('figcaption'))
    text = processor.collect_text(article)
    return processor.create_dictionary('Vihreä lanka', url, r.status_code,
                                       [u''], datetime_list, author, title,
                                       u'', text, images, captions)
def parse(url):
    """Parse a Hyvä terveys article page into the standard article dict."""
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")
    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='region bottom'))
    processor.decompose(
        article.find(class_='field-name-field-related-content'))
    categories = processor.collect_categories(
        article.find_all(class_='field-name-field-category'))
    datetime_list = processor.collect_datetime(
        article.find(class_='field-name-post-date'), 'timedate')
    author = processor.collect_text(
        article.find(class_='field-name-field-author'))
    title = processor.collect_text(article.find('h1'))
    ingress = processor.collect_text(
        article.find(class_='field-name-field-summary'))
    text = processor.collect_text(
        article.find(class_='field-name-field-body'))
    # Skip lazy-load placeholder images.
    images = [img
              for img in processor.collect_images(article.find_all('img'),
                                                  'src', '')
              if 'placeholder' not in img]
    captions = processor.collect_image_captions(
        article.find_all(class_='file-image-description-caption'))
    return processor.create_dictionary('Hyvä terveys', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, ingress, text, images, captions)
def parse_from_archive(url, content):
    """Parse an archived Kauppalehti article from raw HTML *content*.

    The domain becomes 'Kauppalehti Online' when the meta line mentions
    'online'; the returned dictionary always reports status 200 unless
    parsing fails outright.
    """
    article = BeautifulSoup(content, "html.parser")
    if article is None:
        return processor.create_dictionary('Kauppalehti', url, 404, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])
    meta = article.find(class_='hakutuloslahde')
    domain = 'Kauppalehti'
    if 'online' in meta.text:
        domain += ' Online'
    datetime_list = processor.collect_datetime(meta)
    # Meta line has the form "<source>, <category>" when a category exists.
    if ',' in meta.text:
        categories = [processor.collect_text(meta).split(',')[1].strip()]
    else:
        categories = [u'']
    author = processor.collect_text(article.find(class_='signeeraus'))
    title = processor.collect_text(article.find(class_='otsikko'))
    ingress = processor.collect_text(article.find_all(class_='jalkirivi')[1])
    ingress += ' ' + processor.collect_text(article.find(class_='esirivi'))
    ingress = ingress.strip()
    # Join paragraph divs with spaces instead of quadratic string +=.
    text = ' '.join(processor.collect_text(div)
                    for div in article.find_all(class_='artikkelip'))
    text = processor.process(text.strip())
    text += processor.collect_text(article.find(class_='korjaus'))
    captions = processor.collect_image_captions(
        article.find_all(class_='kuva'))
    return processor.create_dictionary(domain, url, 200, categories,
                                       datetime_list, author, title, ingress,
                                       text, [u''], captions)
def parse(url):
    """Parse a Demokraatti article page into the standard article dict."""
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")
    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='keywords-block'))
    processor.decompose_all(article.find_all(class_='share-buttons-block'))
    # NOTE(review): drops the article's last <p> — presumably trailing
    # boilerplate; confirm against live markup.
    processor.decompose(article('p')[-1])
    processor.decompose(article.footer)
    processor.decompose(article.find(class_='wp-user-avatar'))
    categories = processor.collect_categories(
        article.find_all(class_='category'))
    # The date element also contains a category span; strip it before
    # reading the timestamp.
    datetime_data = article.find(class_='single-post-date')
    processor.decompose(datetime_data.find(class_='category'))
    datetime_list = processor.collect_datetime(datetime_data)
    processor.decompose(article.find(class_='single-post-date'))
    author = processor.collect_text(
        article.find(class_='post-author').find('li'))
    title = processor.collect_text(article.find(class_='entry-title'))
    text = processor.collect_text(article.find(class_='post-content'))
    images = processor.collect_images(article.find_all('img'), 'src',
                                      'https://demokraatti.fi')
    return processor.create_dictionary('Demokraatti', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, [u''])
def parse(url):
    """Parse a Verkkouutiset article page into the standard article dict."""
    r = requests.get(url)
    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")
    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])
    processor.decompose_all(article.find_all('script'))
    datetime_list = processor.collect_datetime_objects(
        article.find_all('time'), 'datetime')
    # The byline ends with a ' |' separator that is not part of the name.
    author = processor.collect_text(article.find(class_='posted-on'))
    author = author.replace(' |', '')
    processor.decompose(article.find(class_='entry-meta'))
    title = processor.collect_text(article.find(class_='entry-title'))
    # Collect the ingress, then remove it so it is not repeated in the body.
    ingress = processor.collect_text(
        article.find(class_='entry-content__ingress'))
    processor.decompose(article.find(class_='entry-content__ingress'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='entry-header__caption'))
    text = processor.collect_text(article.find(class_='entry-content'))
    return processor.create_dictionary('Verkkouutiset', url, r.status_code,
                                       [u''], datetime_list, author, title,
                                       ingress, text, images, captions)
def parse(url):
    """Parse an Aamuset article page into the common article dictionary.

    Returns an empty dictionary (site name '') when the page is a 404 or
    has no <article> element.
    """
    r = requests.get(url)

    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'

    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')

    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    # Remove the "related images" widgets so they don't leak into the body.
    processor.decompose_all(
        article.find_all(class_='views-field-field-aamuset-related-images'))

    # Category tags sit outside the article element, so search the full soup.
    categories_element = soup.find(class_='tsv3-c-as-articletags')
    categories = processor.collect_categories(
        categories_element.find_all('li'))

    datetime_list = processor.collect_datetime(article.find('time'))

    # Collect the author first, then drop the element from the body text.
    author = processor.collect_text(article.find(class_='kirjoittaja'))
    processor.decompose(article.find(class_='kirjoittaja'))

    title = processor.collect_text(article.find(class_='otsikko'))
    text = processor.collect_text(
        article.find(class_='tsv3-c-as-article__textitem--teksti'))
    images = processor.collect_images(article.find_all('img'), 'src',
                                      'http://www.aamuset.fi')
    captions = processor.collect_image_captions(
        article.find_all(class_='tsv3-c-as-article__attachment__caption'))

    return processor.create_dictionary('Aamuset', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, captions)
def parse_from_archive(url, content):
    """Parse an archived Satakunnan kansa article from raw HTML *content*.

    Unlike the live parsers, no HTTP request is made: *content* is the
    stored page markup.  Returns the common article dictionary with
    status 200 (or 404 when the markup cannot be parsed at all).
    """
    article = BeautifulSoup(content, "html.parser")

    if article is None:
        return processor.create_dictionary('Satakunnan kansa', url, 404,
                                           [u''], [u''], u'', u'', u'', u'',
                                           [u''], [u''])

    # The source line holds both the datetime and, after a comma, the
    # main category.
    meta = article.find(class_='hakutuloslahde')
    datetime_list = processor.collect_datetime(meta)
    category = processor.collect_text(meta).split(',')[1].strip()
    subcat = processor.collect_text(article.find(class_='jalkirivi'))
    # Keep only the non-empty category strings.
    categories = [c for c in (category, subcat) if c]

    author = processor.collect_text(article.find(class_='signeeraus'))
    title = processor.collect_text(article.find(class_='otsikko'))

    # Ingress is split over two elements; join and trim surrounding space.
    ingress = processor.collect_text(article.find_all(class_='jalkirivi')[1])
    ingress += ' ' + processor.collect_text(article.find(class_='esirivi'))
    ingress = ingress.strip()

    # Body text may span several divs; join them with single spaces.
    text_divs = article.find_all(class_='artikkelip')
    text = ' '.join(processor.collect_text(div) for div in text_divs)
    text = processor.process(text.strip())
    # Append any published correction to the body.
    text += processor.collect_text(article.find(class_='korjaus'))

    captions = processor.collect_image_captions(
        article.find_all(class_='kuva'))

    return processor.create_dictionary('Satakunnan kansa', url, 200,
                                       categories, datetime_list, author,
                                       title, ingress, text, [u''], captions)
def parse(url):
    """Parse an Aamulehti article page into the common article dictionary.

    Returns an empty dictionary (site name '') when the page is a 404 or
    has no element with class 'article-content'.
    """
    r = requests.get(url)

    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'

    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='article-content')

    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='related-articles-container'))

    categories = processor.collect_categories(
        article.find_all(class_='category'))

    # The meta element mixes category and "updated" labels with the
    # publish date; remove both before extracting the datetime.
    datetime_data = article.find(class_='post-meta')
    processor.decompose(datetime_data.find(class_='category'))
    processor.decompose(datetime_data.find(class_='updated'))
    datetime_list = processor.collect_datetime(datetime_data)

    author = processor.collect_text(article.find(class_='author--main'))
    title = processor.collect_text(article.find(class_='heading--main'))

    # Collect images and captions first: the image wrappers are then
    # removed so the captions don't end up in the body text.
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='caption'))
    processor.decompose_all(article.find_all(class_='image-wrapper'))

    text = processor.collect_text(article.find(class_='content--main'))

    return processor.create_dictionary('Aamulehti', url, r.status_code,
                                       categories, datetime_list, author,
                                       title, u'', text, images, captions)
def parse(url):
    """Parse an Mtv article page into the common article dictionary.

    Returns an empty dictionary (site name '') when the page is a 404 or
    has no element with class 'content'.
    """
    r = requests.get(url)

    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'

    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='content')

    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    # Remove scripts, inline styles, ad slots and the bottom widget strip.
    processor.decompose_all(article.find_all('script'))
    processor.decompose_all(article.find_all(class_='ad'))
    processor.decompose_all(article.find_all(class_='ad-container'))
    processor.decompose_all(article.find_all('style'))
    processor.decompose(article.find(id='fullWidthBottom'))

    categories = processor.collect_categories(
        article.find_all(class_='article-category'))
    datetime_list = processor.collect_datetime_objects(
        article.find_all('time'), 'datetime')
    author = processor.collect_text(article.find(class_='author-name'))
    title = processor.collect_text(article.find(class_='article-title'))
    ingress = processor.collect_text(article.find(class_='lead-paragraph'))
    text = processor.collect_text(article.find(class_='editorial'))
    # Image URLs are protocol-relative on this site, hence the 'http:' prefix.
    images = processor.collect_images_by_parent(
        article.find_all(class_='img-container'), 'http:')
    captions = processor.collect_image_captions(
        article.find_all(class_='figcaption'))

    return processor.create_dictionary('Mtv', url, r.status_code, categories,
                                       datetime_list, author, title, ingress,
                                       text, images, captions)
def parse(url):
    """Parse a Maaseudun tulevaisuus article page into the common dictionary.

    Returns an empty dictionary (site name '') when the page is a 404 or
    has no <article> element.
    """
    r = requests.get(url)

    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    r.encoding = 'UTF-8'

    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')

    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''],
                                           [u''], u'', u'', u'', u'', [u''],
                                           [u''])

    processor.decompose_all(article.find_all('script'))

    categories = processor.collect_categories(
        article.find_all(class_='article-release-info__section'))
    datetime_list = processor.collect_datetime(
        article.find(class_='article-release-info__time'))
    author = processor.collect_text(article.find(itemprop='author'))

    # The heading block holds both the title (<h1>) and the ingress (<p>).
    title_div = article.find(class_='article-single-heading')
    title = processor.collect_text(title_div.find('h1'))
    ingress = processor.collect_text(title_div.find('p'))

    text = processor.collect_text(
        article.find(class_='article-single-section__content'))
    images = processor.collect_images(article.find_all('img'), 'src',
                                      'http://www.maaseuduntulevaisuus.fi')
    captions = processor.collect_image_captions(article.find_all('figcaption'))

    return processor.create_dictionary('Maaseudun tulevaisuus', url,
                                       r.status_code, categories,
                                       datetime_list, author, title, ingress,
                                       text, images, captions)