def parse( url ):
    """Parse an Etelä-Suomen Sanomat article page into a processor dictionary.

    Returns a dictionary with empty fields when the page is missing (404)
    or the expected article container cannot be found.
    """
    r = requests.get( url )

    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup( r.text, "html.parser" )

    article = soup.find( class_ = 'mainArticle-content-wrapper' )
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all( article.find_all( 'script' ) )

    header = article.find( id = 'main-article-header' )
    categories = processor.collect_categories( header.find_all( class_ = 'section' ) )
    datetime_list = processor.collect_datetime( article.find( class_ = 'article-date' ) )
    author = processor.collect_text( article.find( class_ = 'authorName' ) )
    title = processor.collect_text( article.find( class_ = 'main-article-header' ) )
    text = processor.collect_text( article.find( class_ = 'body' ) )

    # Strip the author picture and subscription box before image collection.
    processor.decompose( article.find( class_ = 'authorPicture' ) )
    processor.decompose( article.find( id = 'main-subscribe' ) )

    images = processor.collect_images( article.find_all( 'img' ), 'src', '' )
    captions = processor.collect_image_captions(
        article.find_all( class_ = 'main-media-caption' ) )

    return processor.create_dictionary('Etelä-Suomen Sanomat', url,
                                       r.status_code, categories, datetime_list,
                                       author, title, u'', text, images, captions)
def parse( url ):
    """Parse an Iltalehti article page into a processor dictionary.

    Returns a dictionary with empty fields when the page is missing (404)
    or the expected article container cannot be found.
    """
    r = requests.get( url )

    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    # Site is served in Latin-1, not UTF-8.
    r.encoding = 'iso-8859-1'
    soup = BeautifulSoup( r.text, "html.parser" )

    article = soup.find( id = 'container_keski' )
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all( article.find_all( 'script' ) )
    processor.decompose( article.find( class_ = 'kp-share-area' ) )

    categories = processor.collect_categories( soup.find_all( class_ = 'sel' ) )
    datetime_list = processor.collect_datetime( article.find( class_ = 'juttuaika' ) )

    # Drop the author link before collecting the remaining author text.
    author_div = article.find( class_ = 'author' )
    processor.decompose( author_div.find( 'a' ) )
    author = processor.collect_text( author_div, True )

    title = processor.collect_text( article.find( 'h1' ) )
    ingress = processor.collect_text( article.find( class_ = 'ingressi' ), True )

    images = processor.collect_images( article.find_all( 'img' ), 'src', '' )
    captions = processor.collect_image_captions( article.find_all( class_ = 'kuvateksti' ) )

    # Remove image containers so they do not leak into the body text.
    processor.decompose_all( article.find_all( class_ = 'kuvamiddle' ) )
    text = processor.collect_text( article.find( 'isense' ) )

    return processor.create_dictionary('Iltalehti', url, r.status_code,
                                       categories, datetime_list, author, title,
                                       ingress, text, images, captions)
def parse(url):
    """Parse a Tiedonantaja article page into a processor dictionary.

    Returns a dictionary with empty fields when the page is missing (404)
    or the expected article container cannot be found.
    """
    r = requests.get(url)

    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('main')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose_all(article.find_all(class_='nosto'))

    links = article.find(class_='links')
    categories = processor.collect_categories(links.find_all('li'))
    datetime_list = processor.collect_datetime(
        article.find(class_='field-name-field-publish-date'))
    author = processor.collect_text(article.find(class_='tekija'))
    title = processor.collect_text(article.find(id='page-title'))
    text = processor.collect_text(article.find(class_='body'))
    images = processor.collect_images(
        article.find(class_='views-field-field-op-main-image').find_all('img'),
        'src', '')

    return processor.create_dictionary('Tiedonantaja', url, r.status_code,
                                       categories, datetime_list, author, title,
                                       u'', text, images, [u''])
def parse(url):
    """Parse a Faktabaari article page into a processor dictionary.

    Returns a dictionary with empty fields when the page is missing (404)
    or the expected article container cannot be found.
    """
    r = requests.get(url)

    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(id='main-content')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='reviewpic'))

    datetime_list = processor.collect_datetime(article.find(class_='published'))
    author = processor.collect_text(article.find(class_='author'))
    title = processor.collect_text(article.find('h1'))
    text = processor.collect_text(article.find(class_='entry-content'))
    images = processor.collect_images(article.find_all('img'), 'src', '')

    # No category or caption markup on this site.
    return processor.create_dictionary('Faktabaari', url, r.status_code, [u''],
                                       datetime_list, author, title, u'', text,
                                       images, [u''])
def parse( url ):
    """Parse an Uusi Suomi article page into a processor dictionary.

    Returns a dictionary with empty fields when the page is missing (404)
    or the expected article container cannot be found.
    """
    r = requests.get( url )

    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup( r.text, "html.parser" )

    article = soup.find( class_ = 'region-content-inner' )
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all( article.find_all( 'script' ) )
    processor.decompose_all( article.find_all( 'noscript' ) )
    processor.decompose( article.find( id = 'comments' ) )
    processor.decompose( article.find( class_ = 'contributor' ) )
    processor.decompose( article.find( class_ = 'field-name-field-author-image' ) )

    categories = processor.collect_categories(
        article.find_all( class_ = 'field-name-field-category' ) )
    # Timestamps live in the 'content' attribute of the date elements.
    datetime_list = processor.collect_datetime_objects(
        article.find_all( class_ = 'date-display-single' ), 'content' )
    author = processor.collect_text( article.find( class_ = 'author-name' ) )
    title = processor.collect_text( article.find( id = 'page-title' ) )
    text = processor.collect_text( article.find( class_ = 'field-name-body' ) )
    images = processor.collect_images( article.find_all( 'img' ), 'src', '' )
    captions = processor.collect_image_captions(
        article.find_all( class_ = 'field-name-field-image-description' ) )

    return processor.create_dictionary('Uusi Suomi', url, r.status_code,
                                       categories, datetime_list, author, title,
                                       u'', text, images, captions)
def parse( url ):
    """Parse a Suomen uutiset article page into a processor dictionary.

    Returns a dictionary with empty fields when the page is missing (404)
    or the expected article container cannot be found.
    """
    r = requests.get( url )

    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup( r.text, "html.parser" )

    article = soup.find( 'article' )
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all( article.find_all( 'script' ) )
    processor.decompose_all( article.find_all( class_ = 'somebar' ) )
    processor.decompose( article.find( class_ = 'tags' ) )

    categories = processor.collect_categories(
        article.find_all( class_ = 'post-category' ) )

    # The timestamp is free text ("<month> <day>, <year> <hh>:<mm>");
    # convert_month maps the month name to a number before parsing.
    datetime_string = article.find( class_ = 'timestamp' ).get_text( ' ', strip = True )
    datetime_string = processor.convert_month( datetime_string.replace( ',', '' ) )
    datetime_list = [datetime.strptime( datetime_string, '%m %d %Y %H:%M' )]

    author = processor.collect_text( article.find( class_ = 'article-page-writer' ), True )
    title = processor.collect_text( article.find( class_ = 'post-title' ) )
    text = processor.collect_text( article.find( class_ = 'post-content' ) )
    images = processor.collect_images( article.find_all( 'img' ), 'src', '' )
    captions = processor.collect_image_captions( article.find_all( 'figcaption' ) )

    return processor.create_dictionary('Suomen uutiset', url, r.status_code,
                                       categories, datetime_list, author, title,
                                       u'', text, images, captions)
def parse( url ):
    """Parse a Lapin kansa article page into a processor dictionary.

    Returns a dictionary with empty fields when the page is missing (404)
    or the expected article container cannot be found.
    """
    r = requests.get( url )

    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup( r.text, "html.parser" )

    article = soup.find( class_ = 'main-content-area' )
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all( article.find_all( 'script' ) )

    meta = article.find( class_ = 'post-meta' )
    categories = processor.collect_categories( meta.find_all( class_ = 'category' ), True )
    datetime_list = processor.collect_datetime( meta, 'datetime date' )

    author = processor.collect_text( article.find( class_ = 'author--main' ) )
    title = processor.collect_text( article.find( class_ = 'heading--main' ) )
    ingress = processor.collect_text( article.find( class_ = 'heading--secondary' ) )

    images = processor.collect_images( article.find_all( 'img' ), 'src', '' )
    captions = processor.collect_image_captions( article.find_all( class_ = 'caption' ) )

    # Remove image wrappers so they do not leak into the body text.
    processor.decompose_all( article.find_all( class_ = 'image-wrapper' ) )
    text = processor.collect_text( article.find( class_ = 'content--main' ) )

    return processor.create_dictionary('Lapin kansa', url, r.status_code,
                                       categories, datetime_list, author, title,
                                       ingress, text, images, captions)
def parse( url ):
    """Parse a Kd-lehti article page into a processor dictionary.

    Returns a dictionary with empty fields when the page is missing (404)
    or the expected article container cannot be found.
    """
    r = requests.get( url )

    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup( r.text, "html.parser" )

    article = soup.find( 'article' )
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all( article.find_all( 'script' ) )
    processor.decompose_all( article.find( 'header' ).find_all( 'img' ) )
    processor.decompose_all( article.find_all( 'blockquote' ) )
    processor.decompose( article.find( class_ = "meta-sidebar" ) )

    categories = processor.collect_categories( article.find_all( class_ = 'cat' ) )
    datetime_list = processor.collect_datetime( article.find( class_ = 'date' ) )
    author = processor.collect_text( article.find( class_ = 'author' ) )
    title = processor.collect_text( article.find( class_ = 'article-title' ) )
    ingress = processor.collect_text( article.find( class_ = 'ingress' ), True )
    text = processor.collect_text( article.find( class_ = 'content' ) )
    images = processor.collect_images( article.find_all( 'img' ), 'src', '' )
    captions = processor.collect_image_captions(
        article.find_all( class_ = 'featured-image' ) )

    return processor.create_dictionary('Kd-lehti', url, r.status_code,
                                       categories, datetime_list, author, title,
                                       ingress, text, images, captions)
def parse(url):
    """Parse a Taloussanomat article page into a processor dictionary.

    Returns a dictionary with empty fields when the page is missing (404)
    or the expected article container cannot be found.
    """
    r = requests.get(url)

    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='single-article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='related-links'))
    processor.decompose_all(article.find_all(class_='article-ad-block'))

    categories = processor.collect_categories(soup.find(class_='section-title'))
    datetime_list = processor.collect_datetime(article.find('time'))
    author = processor.collect_text(article.find(class_='byline'))
    title = processor.collect_text(article.find(class_='article-title'))
    ingress = processor.collect_text(article.find(class_='ingress'))
    text = processor.collect_text(article.find(class_='body'))
    # Image srcs are protocol-relative; prefix them with 'http:'.
    images = processor.collect_images(article.find_all('img'), 'src', 'http:')
    captions = processor.collect_image_captions(article.find_all('figcaption'))

    return processor.create_dictionary('Taloussanomat', url, r.status_code,
                                       categories, datetime_list, author, title,
                                       ingress, text, images, captions)
def parse(url):
    """Parse a Suomenmaa article page into a processor dictionary.

    Returns a dictionary with empty fields when the page is missing (404).
    This site has no single article container, so the whole soup is pruned
    and queried directly.
    """
    r = requests.get(url)

    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    # Strip page chrome and pull quotes before collecting content.
    processor.decompose_all(soup.find_all(class_='pohja'))
    processor.decompose_all(soup.find_all(class_='footer_left'))
    processor.decompose_all(soup.find_all(class_='keski_footer'))
    processor.decompose_all(soup.find_all(class_='right_footer'))
    processor.decompose_all(soup.find_all(class_='sitaatti'))

    categories = [processor.collect_text(soup.find(class_='vinjetti'))]
    processor.decompose_all(soup.find_all(class_='vinjetti'))

    datetime_list = processor.collect_datetime(
        soup.find(class_='datetime').parent.parent)
    # Timestamps appear newest-first in the markup; reverse to chronological.
    datetime_list.reverse()

    author = processor.collect_text(soup.find(class_='text-editor'))
    title = processor.collect_text(soup.find(class_='otsikko'))
    ingress = processor.collect_text(soup.find(class_='alarivi'))
    processor.decompose_all(soup.find_all(class_='alarivi'))

    # Paragraphs can be duplicated in the markup; skip any paragraph whose
    # text is already contained in what has been collected.
    text = ''
    for paragraph in soup.find_all(class_='teksti'):
        paragraph_text = processor.collect_text(paragraph)
        if paragraph_text not in text:
            text = text + ' ' + paragraph_text
    text = text.strip()

    images = [u'']
    captions = [u'']
    img_div = soup.find(class_='pikkukuva')
    if img_div:
        header_img = img_div.find_all('img')
        # Guard: the div may exist without any <img>, which previously
        # raised IndexError on header_img[0].
        if header_img:
            images = processor.collect_images(header_img, 'data-aghref',
                                              'http://www.suomenmaa.fi/')
            captions = [header_img[0]['alt']]

    return processor.create_dictionary('Suomenmaa', url, r.status_code,
                                       categories, datetime_list, author, title,
                                       ingress, text, images, captions)
def parse(url):
    """Parse a Kouvolan sanomat article page into a processor dictionary.

    Returns a dictionary with empty fields when the page is missing (404)
    or the expected article container cannot be found.
    """
    r = requests.get(url)

    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('blockquote'))
    processor.decompose(article.find(class_='sticky-inner-wrapper'))

    # Breadcrumb: drop the first (front page) and last (current article) items.
    categories_list = soup.find(class_='breadcrumb').find_all('li')[1:-1]
    categories = processor.collect_categories(categories_list)

    datetime_list = processor.collect_datetime(article.find(class_='meta'))

    # Multiple authors are joined with commas.
    authors = article.find(class_='authors')
    author = ','.join(
        processor.collect_text(div.find('p'))
        for div in authors.find_all(class_='author'))
    processor.decompose(authors)

    title = processor.collect_text(article.find('h1'))
    ingress = processor.collect_text(article.find(class_='lead'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(article.find_all('figcaption'))

    # Remove non-body sections, then collect everything that remains.
    processor.decompose(article.find(class_='sticky-outer-wrapper active'))
    processor.decompose(article.find('header'))
    processor.decompose(article.find('footer'))
    text = processor.collect_text(article)

    return processor.create_dictionary('Kouvolan sanomat', url, r.status_code,
                                       categories, datetime_list, author, title,
                                       ingress, text, images, captions)
def parse(url):
    """Parse an Ilkka article page into a processor dictionary.

    Returns a dictionary with empty fields when the page is missing (404)
    or the expected article container cannot be found.
    """
    r = requests.get(url)

    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='article__full')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))

    categories = processor.collect_categories(
        article.find_all(class_='article__meta__category'))
    title = processor.collect_text(article.find(class_='medium-title'))
    datetime_list = processor.collect_datetime(
        article.find(class_='article__meta__timestamp'))
    author = processor.collect_text(article.find(class_='author__name'))
    ingress = processor.collect_text(article.find(class_='lead'))

    # Body text is spread over all <p> elements; join with single spaces.
    text = ' '.join(
        processor.collect_text(string)
        for string in article.find_all('p')).strip()

    images = processor.collect_images(article.find_all('img'), 'src',
                                      'http://www.ilkka.fi')
    # Captions are stored in 'data-caption' attributes on <a> elements.
    captions = [
        caption_element['data-caption']
        for caption_element in article.find_all(
            lambda tag: tag.name == 'a' and 'data-caption' in tag.attrs)
    ]

    return processor.create_dictionary('Ilkka', url, r.status_code, categories,
                                       datetime_list, author, title, ingress,
                                       text, images, captions)
def parse(url):
    """Parse a Vihreä lanka article page into a processor dictionary.

    Returns a dictionary with empty fields when the page is missing (404)
    or the expected article container cannot be found.
    """
    r = requests.get(url)

    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='node-wrap')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))
    # NOTE: the original decomposed 'kredIso' twice; once is sufficient.
    processor.decompose(article.find(class_='kredIso'))
    processor.decompose_all(article.find_all(class_='tyrkkyBox'))
    processor.decompose(article.find(class_='avainsanat'))
    processor.decompose(article.find(class_='twitter-share-button'))
    processor.decompose(article.find(class_='fb-like'))
    processor.decompose(article.find(class_='moreLanka'))
    processor.decompose(article.find('cite'))

    meta = article.find(class_='juttutiedot')
    datetime_list = processor.collect_datetime(meta)
    author = processor.collect_text(meta.find(class_='author'))
    processor.decompose(meta)

    title = processor.collect_text(article.find('h2'), True)
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='kuvaTekstiIso'))

    # Remove caption elements before collecting the remaining body text.
    processor.decompose_all(article.find_all(class_='kuvaTekstiIso'))
    processor.decompose_all(article.find_all('figcaption'))
    text = processor.collect_text(article)

    return processor.create_dictionary('Vihreä lanka', url, r.status_code,
                                       [u''], datetime_list, author, title,
                                       u'', text, images, captions)
def parse(url):
    """Parse a Hyvä terveys article page into a processor dictionary.

    Returns a dictionary with empty fields when the page is missing (404)
    or the expected article container cannot be found.
    """
    r = requests.get(url)

    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='region bottom'))
    processor.decompose(article.find(class_='field-name-field-related-content'))

    categories = processor.collect_categories(
        article.find_all(class_='field-name-field-category'))
    datetime_list = processor.collect_datetime(
        article.find(class_='field-name-post-date'), 'timedate')
    author = processor.collect_text(
        article.find(class_='field-name-field-author'))
    title = processor.collect_text(article.find('h1'))
    ingress = processor.collect_text(
        article.find(class_='field-name-field-summary'))
    text = processor.collect_text(article.find(class_='field-name-field-body'))

    # Skip lazy-load placeholder images.
    images = [
        img
        for img in processor.collect_images(article.find_all('img'), 'src', '')
        if 'placeholder' not in img
    ]
    captions = processor.collect_image_captions(
        article.find_all(class_='file-image-description-caption'))

    return processor.create_dictionary('Hyvä terveys', url, r.status_code,
                                       categories, datetime_list, author, title,
                                       ingress, text, images, captions)
def parse(url):
    """Parse a Demokraatti article page into a processor dictionary.

    Returns a dictionary with empty fields when the page is missing (404)
    or the expected article container cannot be found.
    """
    r = requests.get(url)

    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='keywords-block'))
    processor.decompose_all(article.find_all(class_='share-buttons-block'))
    # The last paragraph is boilerplate; drop it along with the footer.
    processor.decompose(article('p')[-1])
    processor.decompose(article.footer)
    processor.decompose(article.find(class_='wp-user-avatar'))

    categories = processor.collect_categories(
        article.find_all(class_='category'))

    # The date element also contains the category; remove it before parsing.
    datetime_data = article.find(class_='single-post-date')
    processor.decompose(datetime_data.find(class_='category'))
    datetime_list = processor.collect_datetime(datetime_data)
    processor.decompose(article.find(class_='single-post-date'))

    author = processor.collect_text(
        article.find(class_='post-author').find('li'))
    title = processor.collect_text(article.find(class_='entry-title'))
    text = processor.collect_text(article.find(class_='post-content'))
    images = processor.collect_images(article.find_all('img'), 'src',
                                      'https://demokraatti.fi')

    return processor.create_dictionary('Demokraatti', url, r.status_code,
                                       categories, datetime_list, author, title,
                                       u'', text, images, [u''])
def parse(url):
    """Parse a Verkkouutiset article page into a processor dictionary.

    Returns a dictionary with empty fields when the page is missing (404)
    or the expected article container cannot be found.
    """
    r = requests.get(url)

    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))

    datetime_list = processor.collect_datetime_objects(
        article.find_all('time'), 'datetime')

    # Author text ends with a ' |' separator; strip it.
    author = processor.collect_text(article.find(class_='posted-on'))
    author = author.replace(' |', '')
    processor.decompose(article.find(class_='entry-meta'))

    title = processor.collect_text(article.find(class_='entry-title'))
    ingress = processor.collect_text(
        article.find(class_='entry-content__ingress'))
    # Remove the ingress so it is not duplicated in the body text.
    processor.decompose(article.find(class_='entry-content__ingress'))

    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='entry-header__caption'))
    text = processor.collect_text(article.find(class_='entry-content'))

    return processor.create_dictionary('Verkkouutiset', url, r.status_code,
                                       [u''], datetime_list, author, title,
                                       ingress, text, images, captions)
def parse(url):
    """Parse an Aamuset article page into a processor dictionary.

    Returns a dictionary with empty fields when the page is missing (404)
    or the expected article container cannot be found.
    """
    r = requests.get(url)

    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose_all(
        article.find_all(class_='views-field-field-aamuset-related-images'))

    categories_element = soup.find(class_='tsv3-c-as-articletags')
    categories = processor.collect_categories(categories_element.find_all('li'))
    datetime_list = processor.collect_datetime(article.find('time'))

    # Collect the author, then remove it so it stays out of the body text.
    author = processor.collect_text(article.find(class_='kirjoittaja'))
    processor.decompose(article.find(class_='kirjoittaja'))

    title = processor.collect_text(article.find(class_='otsikko'))
    text = processor.collect_text(
        article.find(class_='tsv3-c-as-article__textitem--teksti'))
    images = processor.collect_images(article.find_all('img'), 'src',
                                      'http://www.aamuset.fi')
    captions = processor.collect_image_captions(
        article.find_all(class_='tsv3-c-as-article__attachment__caption'))

    return processor.create_dictionary('Aamuset', url, r.status_code,
                                       categories, datetime_list, author, title,
                                       u'', text, images, captions)
def parse(url):
    """Parse an Aamulehti article page into a processor dictionary.

    Returns a dictionary with empty fields when the page is missing (404)
    or the expected article container cannot be found.
    """
    r = requests.get(url)

    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='article-content')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='related-articles-container'))

    categories = processor.collect_categories(
        article.find_all(class_='category'))

    # The meta element mixes category and update time with the publish time;
    # remove both before parsing the timestamp.
    datetime_data = article.find(class_='post-meta')
    processor.decompose(datetime_data.find(class_='category'))
    processor.decompose(datetime_data.find(class_='updated'))
    datetime_list = processor.collect_datetime(datetime_data)

    author = processor.collect_text(article.find(class_='author--main'))
    title = processor.collect_text(article.find(class_='heading--main'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='caption'))

    # Remove image wrappers so they do not leak into the body text.
    processor.decompose_all(article.find_all(class_='image-wrapper'))
    text = processor.collect_text(article.find(class_='content--main'))

    return processor.create_dictionary('Aamulehti', url, r.status_code,
                                       categories, datetime_list, author, title,
                                       u'', text, images, captions)
def parse(url):
    """Parse a Maaseudun tulevaisuus article page into a processor dictionary.

    Returns a dictionary with empty fields when the page is missing (404)
    or the expected article container cannot be found.
    """
    r = requests.get(url)

    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))

    categories = processor.collect_categories(
        article.find_all(class_='article-release-info__section'))
    datetime_list = processor.collect_datetime(
        article.find(class_='article-release-info__time'))
    author = processor.collect_text(article.find(itemprop='author'))

    # The heading block contains both the title (h1) and the ingress (p).
    title_div = article.find(class_='article-single-heading')
    title = processor.collect_text(title_div.find('h1'))
    ingress = processor.collect_text(title_div.find('p'))

    text = processor.collect_text(
        article.find(class_='article-single-section__content'))
    images = processor.collect_images(article.find_all('img'), 'src',
                                      'http://www.maaseuduntulevaisuus.fi')
    captions = processor.collect_image_captions(article.find_all('figcaption'))

    return processor.create_dictionary('Maaseudun tulevaisuus', url,
                                       r.status_code, categories, datetime_list,
                                       author, title, ingress, text, images,
                                       captions)
def parse(url):
    """Parse a Kansan uutiset article page into a processor dictionary.

    Returns a dictionary with empty fields when the page is missing (404)
    or the expected article container cannot be found.
    """
    r = requests.get(url)

    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find('footer'))
    processor.decompose_all(article.find_all(class_='cb-module-title'))
    processor.decompose_all(article.find_all('blockquote'))
    processor.decompose_all(article.find_all('aside'))

    categories = processor.collect_categories(
        article.find_all(class_='cb-category'))
    datetime_list = processor.collect_datetime(article.find(class_='cb-date'))
    author = processor.collect_text(article.find(class_='cb-author'))
    title = processor.collect_text(article.find(class_='entry-title'))
    # The ingress is the first h4 inside the entry content.
    ingress = processor.collect_text(
        article.find(class_='cb-entry-content').find('h4'), True)
    text = processor.collect_text(article.find(class_='cb-entry-content'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='caption'))

    return processor.create_dictionary('Kansan uutiset', url, r.status_code,
                                       categories, datetime_list, author, title,
                                       ingress, text, images, captions)
def parse(url):
    """Parse a Kokemäenjokilaakson uutiset article page into a processor dictionary.

    Returns a dictionary with empty fields when the page is missing (404)
    or the expected article container cannot be found.
    """
    r = requests.get(url)

    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='post-single')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='avatar'))

    # Metadata is exposed through schema.org itemprop attributes.
    categories = processor.collect_categories(
        article.find_all(itemprop='articleSection'))
    datetime_list = processor.collect_datetime(
        article.find(itemprop='dateCreated datePublished'))
    author = processor.collect_text(article.find(rel='author'))
    title = processor.collect_text(article.find(itemprop='headline'))

    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='sopuli-image-caption'))

    # Remove media blocks before collecting the article body.
    processor.decompose_all(article.find_all(itemprop='associatedMedia'))
    text = processor.collect_text(article.find(itemprop='articleBody'))

    return processor.create_dictionary('Kokemäenjokilaakson uutiset', url,
                                       r.status_code, categories, datetime_list,
                                       author, title, u'', text, images,
                                       captions)
def parse(url):
    """Parse a Mahorkka article page into a processor dictionary.

    Returns a dictionary with empty fields when the page is missing (404)
    or the expected article container cannot be found.
    """
    r = requests.get(url)

    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='post')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose_all(article.find_all('blockquote'))
    processor.decompose(article.find(class_='author-avatar'))
    processor.decompose(
        article.find(id='after-single-post-widget-zone-single-post'))
    processor.decompose(article.find(id='sidebar'))

    categories = processor.collect_categories(
        article.find_all(class_='category'))
    datetime_list = processor.collect_datetime_objects(
        article.find_all('time'), 'datetime')
    author = processor.collect_text(article.find(itemprop='name'))
    # NOTE(review): the leading space in ' xt-post-title' is preserved from
    # the original selector — confirm against the live markup before changing.
    title = processor.collect_text(article.find(class_=' xt-post-title'))
    text = processor.collect_text(article.find(class_='post-body'))
    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(article.find_all('figcaption'))

    return processor.create_dictionary('Mahorkka', url, r.status_code,
                                       categories, datetime_list, author, title,
                                       u'', text, images, captions)
def parse(url):
    """Parse a Hufvudstadsbladet article page into a processor dictionary.

    Returns a dictionary with empty fields when the page is missing (404)
    or the expected article container cannot be found.
    """
    r = requests.get(url)

    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='article-body')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))

    departments = article.find(class_='departments')
    categories = processor.collect_categories(departments.find_all('a'))
    datetime_list = processor.collect_datetime_objects(
        article.find_all('time'), 'datetime')
    author = processor.collect_text(article.find(class_='author'))
    title = processor.collect_text(article.find('h1'))
    ingress = processor.collect_text(article.find(class_='ingress'))

    # This does not get the text because HBL demands registration
    text = processor.collect_text(article.find(class_='text'))

    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='ksf-image-meta'))

    return processor.create_dictionary('Hufvudstadsbladet', url, r.status_code,
                                       categories, datetime_list, author, title,
                                       ingress, text, images, captions)
def parse(url):
    """Parse a Talouselämä article page into a processor dictionary.

    Returns a dictionary with empty fields when the page is missing (404)
    or the expected article container cannot be found.

    NOTE(review): this parser navigates by positional ``.contents`` indexing
    into a React-rendered tree, which is extremely fragile — any markup
    change will break the index chain. Verify against the live page.
    """
    r = requests.get(url)

    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    # Walk fixed child positions down from the #root element to the article.
    root = soup.find(id='root')
    article_container = root.contents[0].contents[1].contents[3]
    article = article_container.contents[0].contents[2].contents[2]
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('aside'))

    categories = processor.collect_categories([article.find('h4')])
    datetime_list = processor.collect_datetime(article.contents[0])
    title = processor.collect_text(article.find('h1'))

    text_section = article.find('section')
    ingress = processor.collect_text(text_section.find('h3'))
    text_container = text_section.contents[0].contents[5]
    text = processor.collect_text(text_container)

    images = processor.collect_images([article.find('img')], 'src', '')
    captions = processor.collect_image_captions(article.find_all('figcaption'))

    # No author markup is available on this site.
    return processor.create_dictionary('Talouselämä', url, r.status_code,
                                       categories, datetime_list, u'', title,
                                       ingress, text, images, captions)
def parse(url):
    """Fetch and parse an Iltasanomat article.

    Returns the site-wide result dict built by processor.create_dictionary
    (empty-field dict on HTTP 404 or when the article body is missing).
    """
    r = requests.get(url)

    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='single-article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose_all(article.find_all(class_='print-url'))
    processor.decompose_all(article.find_all(class_='article-ad-block'))

    # The section name is the first path segment after the host,
    # e.g. http://host/<section>/... — presumably stable for article URLs.
    category = url.split('/')[3]
    categories = [category.capitalize().encode('utf8')]

    datetime_list = processor.collect_datetime(
        article.find(itemprop='datePublished'))
    author = processor.collect_text(article.find(itemprop='author'))
    title = processor.collect_text(article.find('h1'))
    ingress = processor.collect_text(article.find(class_='ingress'))
    text = processor.collect_text(article.find(class_='body'))

    # Image srcs are protocol-relative on this site; prefix restores them.
    images = processor.collect_images(article.find_all('img'), 'src', 'http:')
    captions = processor.collect_image_captions(
        article.find_all(itemprop='caption'))

    return processor.create_dictionary('Iltasanomat', url, r.status_code,
                                       categories, datetime_list, author, title,
                                       ingress, text, images, captions)
def parse(url):
    """Fetch and parse a Turun Sanomat article.

    Returns the site-wide result dict built by processor.create_dictionary
    (empty-field dict on HTTP 404 or when the article element is missing).
    """
    r = requests.get(url)

    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))

    # Guard: a missing meta row would otherwise raise AttributeError on
    # meta.find_all(...). Fall back to the file-wide empty values.
    meta = article.find(class_='tsv3-c-common-article__meta__row1')
    if meta is not None:
        categories = processor.collect_categories(meta.find_all('a'))
        datetime_list = processor.collect_datetime_objects(
            meta.find_all('time'), 'datetime')
    else:
        categories = [u'']
        datetime_list = [u'']

    author = processor.collect_text(article.find(class_='kirjoittaja'))
    title = processor.collect_text(article.find(class_='otsikko'))
    text = processor.collect_text(
        article.find(class_='tsv3-c-common-article__textitem--teksti'))

    # Image srcs are site-relative; prefix makes them absolute.
    images = processor.collect_images(article.find_all('img'), 'src',
                                      'http://www.ts.fi')
    captions = processor.collect_image_captions(
        article.find_all(class_='tsv3-c-common-article__attachment__caption'))

    # No ingress is extracted for this site (empty ingress field).
    return processor.create_dictionary('Turun sanomat', url, r.status_code,
                                       categories, datetime_list, author, title,
                                       u'', text, images, captions)
def parse(url):
    """Fetch and parse a Kaleva article.

    Returns the site-wide result dict built by processor.create_dictionary
    (empty-field dict on HTTP 404 or when the article container is missing).
    """
    r = requests.get(url)

    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find(class_='article-container')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose(article.find(class_='article__related'))
    processor.decompose_all(
        article.find_all(class_='smartblock--juttusivu-markkinointi'))

    # The category is the first whitespace-separated token of the meta text.
    meta = article.find(class_='news__meta')
    categories = [processor.collect_text(meta).split(' ')[0]]
    datetime_list = processor.collect_datetime(meta)

    # Guard: meta.find(...) on a missing meta div would raise AttributeError.
    if meta is not None:
        author = processor.collect_text(meta.find(class_='news__source'))
    else:
        author = u''

    title = processor.collect_text(article.find('h1'))
    text = processor.collect_text(article.find(class_='article__text'))

    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='image__caption'))

    # No ingress is extracted for this site (empty ingress field).
    return processor.create_dictionary('Kaleva', url, r.status_code,
                                       categories, datetime_list, author, title,
                                       u'', text, images, captions)
def parse(url):
    """Fetch and parse a Kainuun Sanomat article.

    Returns the site-wide result dict built by processor.create_dictionary
    (empty-field dict on HTTP 404 or when the article element is missing).
    """
    r = requests.get(url)

    if r.status_code == 404:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    r.encoding = 'UTF-8'
    soup = BeautifulSoup(r.text, "html.parser")

    article = soup.find('article')
    if article is None:
        return processor.create_dictionary('', url, r.status_code, [u''], [u''],
                                           u'', u'', u'', u'', [u''], [u''])

    processor.decompose_all(article.find_all('script'))
    processor.decompose_all(article.find_all(class_='attImage'))

    # Categories are embedded as <b> elements inside the <time> element.
    # Guard: a missing <time> would otherwise raise AttributeError.
    meta = article.find('time')
    if meta is not None:
        categories = processor.collect_categories(meta.find_all('b'))
    else:
        categories = [u'']
    datetime_list = processor.collect_datetime(meta)

    author = processor.collect_text(article.find(class_='Kirjoittaja'), True)
    title = processor.collect_text(article.find('h1'))
    ingress = processor.collect_text(article.find(class_='Alaotsikko'))
    text = processor.collect_text(article.find(class_='Teksti'))

    images = processor.collect_images(article.find_all('img'), 'src', '')
    captions = processor.collect_image_captions(
        article.find_all(class_='featuredCaption'))

    return processor.create_dictionary('Kainuun sanomat', url, r.status_code,
                                       categories, datetime_list, author, title,
                                       ingress, text, images, captions)