def extract_content(url):
    content = get_static_content()
    try:
        ua = UserAgent()
        config = Config()
        config.browser_user_agent = ua.chrome
        config.language = 'es'
        article = Article(url, config=config)
        article.download()
        article.parse()
        content['text'] = article.text
        content['image'] = article.top_image
        content['videos'] = [movie for movie in article.movies]
    except Exception:
        print_exc()
    return content
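
# A minimal usage sketch for extract_content above, assuming the imports the
# function appears to rely on (fake_useragent.UserAgent, newspaper's Article and
# Config, traceback.print_exc). get_static_content is a project helper not shown
# here, so a hypothetical stub stands in for it.
from traceback import print_exc

from fake_useragent import UserAgent
from newspaper import Article, Config


def get_static_content():
    # Hypothetical stand-in for the project's own helper.
    return {'text': '', 'image': '', 'videos': []}


if __name__ == '__main__':
    result = extract_content('https://elpais.com/some-article.html')  # example URL
    print(result['image'], len(result['text']))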
def write_article(self):
    # Skip non-JSON control messages (e.g. a subscribe confirmation whose data is 1).
    if self.message['data'] == 1:
        return
    collection = self.db['articles']
    data = json.loads(self.message['data'])
    articles = data['articles']
    payloads = []
    count = 0
    for article_url in articles:
        article = Article(article_url)
        article.build()
        payload = {}
        payload['meta_keywords'] = article.meta_keywords
        payload['title'] = article.title
        payload['url'] = article.url
        payload['text'] = article.text
        payload['html'] = article.html
        payload['keywords'] = article.keywords
        payload['_id'] = hashlib.sha1(article.title.encode('utf-8')).hexdigest()
        payload['crawled_at'] = str(int(time.time()))
        payloads.append(payload)
        count += 1
        if count % 100 == 0:
            collection.insert_many(payloads)
            payloads = []
    if payloads:
        collection.insert_many(payloads)
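
# A hedged sketch of the context write_article above appears to expect: a
# handler holding a MongoDB database and a Redis pub/sub message. The
# ArticleWriter class, the 'articles' channel and the connection URLs are
# illustrative assumptions, not part of the original code.
import hashlib
import json
import time

import redis
from pymongo import MongoClient
from newspaper import Article


class ArticleWriter:
    def __init__(self, message, db):
        self.message = message
        self.db = db


# Attach the function defined above as a method of the hypothetical handler.
ArticleWriter.write_article = write_article

if __name__ == '__main__':
    db = MongoClient('mongodb://localhost:27017')['news']
    pubsub = redis.Redis().pubsub()
    pubsub.subscribe('articles')
    for message in pubsub.listen():
        ArticleWriter(message, db).write_article()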
def parse_article(self, response):
    # Utilize newspaper for article parsing.
    article = Article(url=response.url, config=self.config)
    article.set_html(response.body)
    article.parse()
    item = Art()
    item['title'] = article.title
    item['url'] = article.url
    item['text'] = '\n'.join(nlp.split_sentences(article.text.replace('\n', ' ')))
    yield item
def get_summary(self, title, text):
    # Build an Article from raw title/text and mark it as downloaded and
    # parsed so that nlp() can run without fetching anything.
    article = Article(url='')
    article.title = title
    article.text = text
    article.download_state = ArticleDownloadState.SUCCESS
    article.is_parsed = True
    article.nlp()
    return self.preprocess_text(article.summary)
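
# A hedged sketch of exercising get_summary above. It assumes get_summary is a
# method of some summarizer class whose only other piece needed here is
# preprocess_text; DummySummarizer and the sample title/text are illustrative,
# and newspaper's nlp() additionally requires the NLTK punkt data.
from newspaper.article import Article, ArticleDownloadState


class DummySummarizer:
    def preprocess_text(self, text):
        # Hypothetical stand-in for the real preprocessing step.
        return ' '.join(text.split())


if __name__ == '__main__':
    summary = get_summary(
        DummySummarizer(),
        "Sample headline",
        "First sentence of the body. Second sentence adds detail. Third sentence concludes.",
    )
    print(summary)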
def parse_article(self, response):
    if len(response.body) > 0:
        # Utilize newspaper for article parsing.
        article = Article(url=response.url, config=self.config)
        article.set_html(response.body)
        article.parse()
        # self.sentences.append(nlp.split_sentences(article.text))
        item = Art()
        item['title'] = article.title
        item['url'] = article.url
        item['text'] = '\n'.join(nlp.split_sentences(article.text.replace('\n', ' ')))
        yield item
    else:
        print(response.url + ' DEAD LINK')
def read_article(url):
    article = Article(url)
    article.download()
    article.parse()
    article.nlp()
    print(article.text)
    # print('SUMMARY')
    print(article.summary)
def parse(self, response):
    # print(type(response))
    article = None
    try:
        article = NewsPlease.from_html(response.body.encode("utf-8"))
    except Exception:
        article = NewsPlease.from_html(
            response.body.decode('latin-1').encode("utf-8"))
        print("EXCEPTION OCCURRED")
    print(article.date_publish)
    # print(article.text)
    article2 = Article(url="", language="es")
    article2.set_html(response.body)
    article2.parse()
    print(response.url)
    self.db.articles_es.insert({
        "title": article.title,
        "pub_date": article.date_publish,
        "url": response.url,
        "content": article2.text,
        "raw_html": response.body
    })
    links = self.linkExtractor.extract_links(response)
    for link in links:
        yield scrapy.Request(link.url, callback=self.parse)
def get_article(url):
    article = Article(url, language='pt')
    # Simple handling of the URL: use requests to download the HTML as text,
    # do a light clean-up of that HTML and keep the full text so that a
    # parser can then be built from it.
    article.download()
    # Analyse the downloaded HTML.
    article.parse()
    return article
def crawl_today():
    """Scheduled daily crawl; running once every 5 hours is enough, and one page is crawled per category."""
    now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print('\n', domain, now, "BEGIN")
    while True:
        try:
            conn = connect(host=host, user='******', password='******', port=3306, db='chinaaseanocean')
        except OperationalError as e:
            print(e)
            time.sleep(3)
        else:
            break
    for class_1 in categories:
        params['tag'] = class_1
        params['pgno'] = str(1)
        page_url = base_url + urlencode(params)
        failure = 0
        while failure < 3:
            try:
                doc = pq(page_url, headers=headers, verify=False)
            except Exception as e:
                failure += 1
                print('\rFailed to fetch news links, reason:', e, end='', flush=True)
            else:
                break
        else:
            continue
        ul = doc('ul.timeline > li')
        for li in ul.items():
            url = li.find('h2 a').attr('href')
            article = Article(url)
            try:
                article.download()
                article.parse()
            except ArticleException as e:
                # print(e)
                continue
            content = article.text
            if content:
                title = article.title
                date = article.publish_date
                class_2 = li.find('div.timeline-content > a').text()
                # print(title)
                cursor = conn.cursor()
                sql = 'REPLACE INTO `asean_news` VALUES (%s)' % (','.join(['%s'] * 8))
                cursor.execute(sql, ('Malaysia', domain, class_1, class_2, title, date, content, url))
                conn.commit()
                cursor.close()
    conn.close()
    now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print('\n', domain, now, "DONE")
def parseArticle(articles: ResultSet, host: str, src_ele: str,
                 summary_ele: str, date_ele: str, url_ele: str):
    global parse_done
    global config_en
    global articles_count
    articles_count += len(articles)
    for a in articles:
        src = a.find(src_ele)
        summary = a.find(summary_ele)
        date = a.find(date_ele)
        if src is None:
            src = host
        else:
            src = src.text
        if summary is None:
            summary = a.find('description')  # fallback
        if summary is not None:
            summary = summary.text
        url = a.find(url_ele)
        if url is not None:
            url = url.text.strip()
        else:
            url = ''
        if url != '':
            article = Article(url, config=config_en)
            if date is not None:
                try:
                    date = parse(date.text)
                except Exception:
                    date = None
            try:
                article.download()
                article.parse()
            except Exception as ex:
                log(f'{ex}, url is "{url}"')
            finally:
                if isinstance(article.publish_date, datetime) and date is None:
                    date = article.publish_date.strftime('%Y-%m-%d %H:%M:%S')
                insert_db((src, date, article.title, summary, article.text, article.url))
                parse_done += 1
def crawl_archive():
    """Crawl all news from the past week."""
    conn = connect(host=host, user='******', password='******', port=3306, db='chinaaseanocean')
    for c in categories:
        for i in range(1, MAX_PAGE + 1):
            params['tag'] = c
            params['pgno'] = str(i)
            page_url = base_url + urlencode(params)
            try:
                doc = pq(page_url, headers=headers, verify=False)
            except requests.exceptions.ConnectionError as e:
                print(e)
                doc = pq(page_url, headers=headers, verify=False)
            ul = doc('ul.timeline > li')
            for li in ul.items():
                url = li.find('h2 a').attr('href')
                article = Article(url)
                try:
                    article.download()
                    article.parse()
                except ArticleException as e:
                    print(e)
                    continue
                title = article.title
                date = article.publish_date
                content = article.text
                class_2 = li.find('div.timeline-content > a').text()
                if content:
                    print(title)
                    cursor = conn.cursor()
                    sql = 'REPLACE INTO `asean_news` VALUES (%s)' % (','.join(['%s'] * 9))
                    cursor.execute(sql, ('Malaysia', domain, c, class_2, None, title, date, content, url))
                    conn.commit()
                    cursor.close()
    conn.close()
try:
    db_connect.init_database_cont()
    # Process each homepage.
    for home_page in washington_post_home_pages:
        print("extracting: " + home_page)
        washington_page = requests.get(home_page)
        html_tree = html.fromstring(washington_page.text)
        article_urls = html_tree.xpath('//a/@href')
        for home_url in article_urls:
            if home_url is not None and len(home_url) > 16:
                if 'http://' not in home_url and 'https://' not in home_url:
                    home_url = WASHINGTON_POST + home_url
                try:
                    article_home = Article(home_url, keep_article_html=True)
                    extract_washington_post_article(article_home, True,
                                                    washington_post_home_pages.get(home_page))
                except Exception as e:
                    print('Something went wrong when processing homepage ' + home_page +
                          ' article: {} '.format(e) + home_url)
    db_connect.close_database_cont()
except Exception as e:
    print('Something went wrong with database: {}'.format(e))
def download_and_parse(article: Article):
    try:
        article.download()
        article.parse()
    except newspaper.article.ArticleException:
        pass
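
# A hedged usage example for download_and_parse above: because ArticleException
# is swallowed, a caller can inspect download_state and is_parsed afterwards to
# see whether the fetch succeeded. The URL is only an illustration.
from newspaper import Article
from newspaper.article import ArticleDownloadState

a = Article('https://example.com/some-story.html')
download_and_parse(a)
if a.download_state == ArticleDownloadState.SUCCESS and a.is_parsed:
    print(a.title)
else:
    print('download or parse failed; article fields left empty')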
def fetch_story(request):
    if request.method == 'GET':
        # List to store all the parsed RSS entries.
        story_list = []
        # Get the Source object from the 'item_id' passed through the request.
        source_id = request.GET.get('item_id')
        if source_id is None:
            # If none, return to the sources list.
            return HttpResponseRedirect('/sources_list/')
        # Get the sourcing object.
        try:
            rss_obj = Sourcing.objects.get(id=source_id)
        except Sourcing.DoesNotExist:
            messages.info(request, 'Source Does Not Exist, Please try another one.')
            return HttpResponseRedirect('/sources_list/')
        # Parse the RSS URL and get the data.
        feed_data = feedparser.parse(rss_obj.rss_url)
        # Detect if the URL is not well-formed RSS.
        if feed_data.bozo == 1:
            url_error = {
                'Possible Wrong URL. Click here to go back to Sources page.'
            }
            return render_to_response('fetch_story.html', {
                'url_error': url_error,
                'user': request.user
            })
        else:
            for data in feed_data.get('entries'):
                story_url = data.get('link')
                # If the RSS feed is empty, return the story listing page.
                if story_url is None:
                    rss_error = {
                        'Either RSS is empty or RSS is broken. Click here to go back to Story Listing page'
                    }
                    return render_to_response('fetch_story.html', {
                        'rss_error': rss_error,
                        'user': request.user
                    })
                # Use the newspaper library to download the article.
                article = Article(story_url)
                try:
                    article.download()
                except ArticleException:
                    logger.debug("Article Download exception in : %s" % story_url)
                # Try to parse the article.
                try:
                    article.parse()
                except ArticleException:
                    logger.debug("Exception in article parse")
                article_instance = article
                # If the publish date is missing, fall back to the feed date or the current datetime.
                if article_instance.publish_date is None:
                    if data.get('published') is None:
                        article_instance.publish_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                    else:
                        article_instance.publish_date = datetime.strptime(
                            data.get('published'),
                            '%a, %d %b %Y %H:%M:%S GMT').strftime('%Y-%m-%d %H:%M:%S')
                    # article_instance.publish_date = datetime.now().strftime('%a, %e %b %Y %H:%M:%S')
                elif not isinstance(article_instance.publish_date, datetime):
                    article_instance.publish_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                    # article_instance.publish_date = datetime.now().strftime('%a, %e %b %Y %H:%M:%S')
                # If the body is empty, assign dummy text.
                if article_instance.text == '':
                    article_instance.text = ("This is a Dummy text as some error occurred while "
                                             "fetching body of this story. Click the Story title "
                                             "to visit the Story page.")
                try:
                    # Check whether the story already exists.
                    Stories.objects.select_related('source').get(url=story_url)
                except Stories.DoesNotExist:
                    story = Stories(title=article_instance.title,
                                    source=rss_obj,
                                    pub_date=article_instance.publish_date,
                                    body_text=article_instance.text,
                                    url=article_instance.url)
                    story.save()
                # Add each downloaded article's details to story_list and pass it to the HTML template.
                story_list += [article_instance]
            return render_to_response('fetch_story.html', {
                'data': story_list,
                'rss_id': rss_obj,
                'user': request.user
            })
    else:
        return HttpResponseRedirect('/sources_list/')
def handle(self, *args, **options):
    source_obj = Sourcing.objects.all()
    stories_list = list(Stories.objects.values_list('url', flat=True))
    # Counters for timing and iteration statistics.
    not_rss_url = 0
    fetched_story_count = 0
    existing_story_count = len(stories_list)
    download_exception = 0
    parsing_exception = 0
    broken_rss_list = 0
    print("\n\n ------------------------Started fetching Url's:------------------------ \n")
    start_time = datetime.now()
    sources = tqdm(source_obj)
    for list_item in sources:
        # Sources progress bar.
        sources.set_description('Source Completed ')
        # Parse data from the RSS URL.
        feed_data = feedparser.parse(list_item.rss_url)
        # Detect if the URL is not well-formed RSS.
        if feed_data.bozo == 1:
            logger.debug("Not a RSS url : %s" % list_item.rss_url)
            not_rss_url += 1
        else:
            # Stories progress bar using tqdm.
            story_entries = tqdm(feed_data.get('entries'))
            # Iterate through each story URL. URLs already present in the
            # database are skipped; new ones are downloaded and saved to the Stories DB.
            for data in story_entries:
                # Stories progress bar title.
                story_entries.set_description('Stories Completed ')
                # Get the story URL from the entries list.
                story_url = data.get('link')
                # If the RSS entry has no link, count the feed as broken.
                if story_url is None:
                    logger.debug("No feed data in RSS URL: %s" % list_item.rss_url)
                    broken_rss_list += 1
                else:
                    # If the story already exists, skip it; otherwise download and save it.
                    if story_url in stories_list:
                        stories_list.remove(story_url)
                    else:
                        # Use the newspaper library to download the article.
                        article = Article(story_url)
                        try:
                            article.download()
                        except ArticleException:
                            logger.debug("Article Download exception in : %s" % story_url)
                            download_exception += 1
                        # Parse the article.
                        try:
                            article.parse()
                        except ArticleException:
                            logger.debug("Article parse exception in : %s" % story_url)
                            parsing_exception += 1
                        article_instance = article
                        # If the publish date is missing or not a datetime, fall back to the feed date or now.
                        if article_instance.publish_date is None:
                            if data.get('published') is None:
                                article_instance.publish_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                            else:
                                article_instance.publish_date = datetime.strptime(
                                    data.get('published'), '%a, %d %b %Y %H:%M:%S GMT'
                                ).strftime('%Y-%m-%d %H:%M:%S')
                        elif not isinstance(article_instance.publish_date, datetime):
                            article_instance.publish_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                        # If the body is empty, assign dummy text.
                        if article_instance.text == '':
                            article_instance.text = ("This is a Dummy text as some error occurred while "
                                                     "fetching body of this story. Click the Story title "
                                                     "to visit the Story page.")
                        # Save the story.
                        story = Stories(
                            title=article_instance.title,
                            source=list_item,
                            pub_date=article_instance.publish_date,
                            body_text=article_instance.text,
                            url=article_instance.url
                        )
                        story.save()
                        fetched_story_count += 1
    stop_time = datetime.now()
    execution_time = stop_time - start_time
    final_count = len(Stories.objects.values_list('url', flat=True))
    print("""
    ------------------------Finished fetching Url's:------------------------
    Final Result:
        No of Existing Stories          : {0}
        No of New Stories Fetched       : {1}
        No of wrong Rss Url's           : {2}
        No of Broken or Empty Rss Url's : {3}
        No of Stories not Downloaded    : {4}
        No of Stories not Parsed        : {5}
        -------------------------------------------------
        Total Stories                   : {6}
        -------------------------------------------------
        Process Execution time          : {7}
    ------------------------------------------------------------------------
    """.format(existing_story_count, fetched_story_count, not_rss_url,
               broken_rss_list, download_exception, parsing_exception,
               final_count, execution_time))
'''
Created on Feb 28, 2015

@author: hoavu
'''
from newspaper.article import Article

url = 'http://www.huffingtonpost.com/2015/02/27/jennifer-lawrence-david-o-russell_n_6772866.html'
article = Article(url)
article.download()
article.parse()
# print(article.html)
print(article.text)
import re

import newspaper
from newspaper.article import Article

# url = 'http://fox13now.com/2013/12/30/new-year-new-laws-obamacare-pot-guns-and-drones/'
# article = Article(url)
# article.parse()
# article.authors
# article.text
# cnn_paper = newspaper.build('http://cnn.com')

url = 'http://news.163.com/17/0312/10/CFAP3Q9G000189FH.html'
a = Article(url, language='zh')  # Chinese
a.download()
a.parse()
a.nlp()  # keywords are only populated after nlp()
print(a.keywords)
print("===============")
print(a.title)
print("===============")
print(a.authors)
print("===============")
print(a.text[:150])

# filter_regex = re.compile(r'[^a-zA-Z0-9\ ]')
# title_text_h1 = "我与总书记议国是:建设社会稳定长治久安新AA边疆,总书记 社会稳定 全国人大代表00000ddd"
# filter_title_text_h1 = filter_regex.sub('', title_text_h1).lower()
def getArticleTitleText(url):
    article = Article(url)
    article.download()
    article.parse()
    return [article.title, article.text.encode('utf-8')]
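
# A quick sketch of calling getArticleTitleText above. Note it returns the body
# as UTF-8 bytes while the title stays a str; the URL below is only an example.
title, text_bytes = getArticleTitleText('https://example.com/news/some-article.html')
print(title)
print(text_bytes.decode('utf-8')[:200])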
# try:
#     article_home = Article(home_url)
#     extract_vnexpress_article(article_home, True, 'expressing')
# except Exception as e:
#     print('Something went wrong when processing homepage expressing article: {} '.format(e) + home_url)

# Process the homepage.
VNEXPRESS_HOMEPAGE = 'http://vnexpress.net/'
try:
    vnexpress_homepage = requests.get(VNEXPRESS_HOMEPAGE)
    html_tree = html.fromstring(vnexpress_homepage.text)
    article_urls = html_tree.xpath('//a/@href')
    for home_url in article_urls:
        if home_url is not None and len(home_url) > 16:
            if 'http://' not in home_url and 'https://' not in home_url:
                home_url = VNEXPRESS_HOME + home_url
            try:
                article_home = Article(home_url)
                extract_vnexpress_article(article_home, True)
            except Exception as e:
                print('Something went wrong when processing homepage article: {} '.format(e) + home_url)
    db_connect.close_database_cont()
except Exception as e:
    print('Something went wrong with database: {}'.format(e))

# =====================================================================
# =========================== VNExpress stop ==========================
# =====================================================================
from typing import List

import pytest
from newspaper.article import Article

from kindle_news_assistant.agent import Agent


@pytest.fixture
def agent():
    return Agent()


@pytest.mark.parametrize(
    "articles,expected_len",
    [
        (
            [
                Article("https://cnn.com/0/16/article-title.html", "https://cnn.com"),
                Article(
                    "https://cnn.com/0/16/article-title.html?query=yes",
                    "https://cnn.com",
                ),
                Article("https://cnn.com/1/16/different-title.html", "https://cnn.com"),
            ],
            2,
        ),
    ],
)
def test_filter_duplicates(agent: Agent, articles: List[Article], expected_len: int):
    filtered = agent.filter_duplicates(articles)
    assert len(filtered) == expected_len
def crawl_today():
    """Used for the daily update run."""
    now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print('\n', domain, now, "BEGIN", flush=True)
    while True:
        try:
            conn = connect(host=host, user='******', password='******', port=3306, db='chinaaseanocean')
        except OperationalError as e:
            print(e)
            time.sleep(3)
        else:
            break
    for key in class_.keys():
        class_1 = class_[key]
        # print(class_1)
        page = 1
        while page < 3:
            page_url = base_url + key + '&max_page=' + str(page)
            # print(page_url)
            failure = 0
            while failure < 3:
                try:
                    doc = pq(page_url, headers=headers, verify=False)
                except Exception as e:
                    failure += 1
                    print(e)
                else:
                    break
            else:
                continue
            a_list = doc('div.w3-justify a')
            for a in a_list.items():
                news_url = 'http://www.bernama.com/en/' + a.attr('href')
                article = Article(news_url)
                try:
                    article.download()
                    article.parse()
                except ArticleException as e:
                    print(e)
                    continue
                content = pattern.sub('', article.text).replace('\n', '')
                if content:
                    url = article.url
                    title = article.title
                    try:
                        date = '-'.join(pattern.findall(article.html)[0].split('/')[::-1])
                    except Exception:
                        date = ''
                    # print(title, date, content, sep='\n')
                    cursor = conn.cursor()
                    sql = 'REPLACE INTO `asean_news` VALUES (%s)' % (','.join(['%s'] * 8))
                    cursor.execute(sql, ('Malaysia', domain, class_1, None, title, date, content, url))
                    conn.commit()
                    cursor.close()
            if len(a_list) < 7:
                break
            page = page + 1
    conn.close()
    now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print('\n', domain, now, "DONE")
'''
@author: hoavu
'''
import requests
import urllib.parse

import nltk
from newspaper.article import Article

from crawlerApp.utils import get_cosine, text_to_vector, normalize_text

# Get the stop words first.
with open("stopwords_en.txt", "r") as myfile:
    stopwords = myfile.read().replace('\n', '')

normalized_url = 'http://www.nytimes.com/2015/05/25/science/john-nash-a-beautiful-mind-subject-and-nobel-winner-dies-at-86.html'
article1 = Article(normalized_url)
article1.download()

# 0.62 is a good threshold.
normalized_url2 = 'http://abcnews.go.com/US/john-nash-beautiful-mind-mathematician-wife-killed-jersey/story?id=31268512'
article2 = Article(normalized_url2)
article2.download()
print("download finished")

article1.parse()
string1 = article1.text
article2.parse()
string2 = article2.text

normalised_string1 = normalize_text(string1)
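
# A hedged sketch of how the comparison above might be completed with the
# helpers imported from crawlerApp.utils. The exact signatures of
# normalize_text, text_to_vector and get_cosine are assumed from their names,
# and 0.62 is the threshold mentioned above.
normalised_string2 = normalize_text(string2)
vector1 = text_to_vector(normalised_string1)
vector2 = text_to_vector(normalised_string2)
similarity = get_cosine(vector1, vector2)
print('cosine similarity:', similarity)
if similarity >= 0.62:
    print('the two articles likely cover the same story')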