def extract_content(url):
    """Fetch the article at *url* and return a content dict.

    Starts from the static/default content returned by ``get_static_content()``
    and, on success, fills in ``'text'``, ``'image'`` and ``'videos'`` from the
    downloaded article. On any failure the partially-filled dict is returned
    and the traceback is printed.
    """
    # BUG FIX: the original assigned `content = {}` and immediately
    # overwrote it with get_static_content() — the dead assignment is removed.
    content = get_static_content()
    try:
        ua = UserAgent()
        config = Config()
        config.browser_user_agent = ua.chrome  # spoof a Chrome UA string
        config.language = 'es'  # site content is Spanish
        article = Article(url, config=config)
        article.download()
        article.parse()
        content['text'] = article.text
        content['image'] = article.top_image
        # article.movies is already an iterable of video links.
        content['videos'] = list(article.movies)
    except Exception:
        # BUG FIX: traceback.print_exc() takes (limit, file, chain) — the
        # original print_exc(e) passed the exception as `limit`, which is wrong.
        print_exc()
    return content
def read_article(url):
    """Download, parse, and NLP-process the article at *url*.

    Prints the full article text followed by its generated summary.
    """
    story = Article(url)
    story.download()
    story.parse()
    story.nlp()  # required before .summary is populated
    print(story.text)
    # Summary comes from newspaper's NLP pass above.
    print(story.summary)
def get_article(url):
    """Return a downloaded and parsed Portuguese-language Article for *url*.

    Downloads the page HTML via newspaper (which cleans it up internally)
    and parses it so the caller receives a ready-to-use Article object.
    """
    story = Article(url, language='pt')
    story.download()
    story.parse()
    return story
def crawl_today():
    """
    Scheduled daily crawl: intended to run roughly every 5 hours, fetching
    one listing page per category and storing new articles in MySQL.
    (Original docstring was in Chinese.)
    """
    now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print('\n', domain, now, "BEGIN")
    # Retry the MySQL connection until it succeeds (3s back-off per attempt).
    while True:
        try:
            conn = connect(host=host, user='******', password='******', port=3306, db='chinaaseanocean')
        except OperationalError as e:
            print(e)
            time.sleep(3)
        else:
            break
    for class_1 in categories:
        params['tag'] = class_1
        params['pgno'] = str(1)  # only the first page for the daily run
        page_url = base_url + urlencode(params)
        failure = 0
        # Up to 3 attempts to fetch the category listing; if all fail,
        # the while's `else` fires and we skip to the next category.
        while failure < 3:
            try:
                doc = pq(page_url, headers=headers, verify=False)
            except Exception as e:
                failure += 1
                print('\r获取新闻链接失败,原因:', e, end='', flush=True)
            else:
                break
        else:
            continue
        ul = doc('ul.timeline > li')
        for li in ul.items():
            url = li.find('h2 a').attr('href')
            article = Article(url)
            try:
                article.download()
                article.parse()
            except ArticleException as e:
                # Skip articles that fail to download/parse.
                # print(e)
                continue
            content = article.text
            if content:  # only store articles with a non-empty body
                title = article.title
                date = article.publish_date
                # Second-level category comes from the listing markup.
                class_2 = li.find('div.timeline-content > a').text()
                # print(title)
                cursor = conn.cursor()
                # REPLACE (not INSERT) so re-crawled URLs overwrite old rows;
                # table has 8 columns.
                sql = 'REPLACE INTO `asean_news` VALUES (%s)' % (','.join(['%s'] * 8))
                cursor.execute(sql, ('Malaysia', domain, class_1, class_2, title, date, content, url))
                conn.commit()
                cursor.close()
    conn.close()
    now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print('\n', domain, now, "DONE")
def parseArticle(articles: ResultSet, host: str, src_ele: str, summary_ele: str,
                 date_ele: str, url_ele: str):
    """Extract fields from RSS entry elements and insert each article into the DB.

    Args:
        articles: BeautifulSoup ResultSet of feed entry elements.
        host: fallback source name when no source element is present.
        src_ele / summary_ele / date_ele / url_ele: tag names to look up
            inside each entry.

    Side effects: increments the module-level counters ``articles_count`` and
    ``parse_done`` and calls ``insert_db`` once per entry with a usable URL —
    even when download/parse failed (best-effort, via ``finally``).
    """
    global parse_done
    global config_en
    global articles_count
    articles_count += len(articles)
    for a in articles:
        src = a.find(src_ele)
        summary = a.find(summary_ele)
        date = a.find(date_ele)
        if src is None:
            src = host  # no explicit source: attribute to the feed host
        else:
            src = src.text
        if summary is None:
            summary = a.find('description')  # fallback
        if summary is not None:
            summary = summary.text
        url = a.find(url_ele)
        if url is not None:
            url = url.text.strip()
        else:
            url = ''
        if url != '':
            article = Article(url, config=config_en)
            if date is not None:
                try:
                    date = parse(date.text)
                except Exception:
                    # Unparseable feed date: fall back to the article's own
                    # publish date below.
                    date = None
            try:
                article.download()
                article.parse()
            except Exception as ex:
                log(f'{ex}, url is "{url}"')
            finally:
                # BUG FIX: the original tested `article.publish_date is datetime`,
                # an identity comparison against the class object that is always
                # False, so the parsed publish date was never used.
                if isinstance(article.publish_date, datetime) and date is None:
                    date = article.publish_date.strftime('%Y-%m-%d %H:%M:%S')
                insert_db((src, date, article.title, summary, article.text,
                           article.url))
                parse_done += 1
def crawl_archive():
    """
    Crawl all news from the past week, paging through each category up to
    MAX_PAGE. (Original docstring was in Chinese.)
    """
    conn = connect(host=host, user='******', password='******', port=3306, db='chinaaseanocean')
    for c in categories:
        for i in range(1, MAX_PAGE + 1):
            params['tag'] = c
            params['pgno'] = str(i)
            page_url = base_url + urlencode(params)
            try:
                doc = pq(page_url, headers=headers, verify=False)
            except requests.exceptions.ConnectionError as e:
                print(e)
                # Single blind retry; NOTE(review): a second failure will
                # propagate out of the function.
                doc = pq(page_url, headers=headers, verify=False)
            ul = doc('ul.timeline > li')
            for li in ul.items():
                url = li.find('h2 a').attr('href')
                article = Article(url)
                try:
                    article.download()
                    article.parse()
                except ArticleException as e:
                    # Skip articles that fail to download/parse.
                    print(e)
                    continue
                title = article.title
                date = article.publish_date
                content = article.text
                # Second-level category comes from the listing markup.
                class_2 = li.find('div.timeline-content > a').text()
                if content:  # only store non-empty bodies
                    print(title)
                    cursor = conn.cursor()
                    # REPLACE so re-crawled URLs overwrite; 9-column table here
                    # (one more than crawl_today's 8 — the extra slot is None).
                    sql = 'REPLACE INTO `asean_news` VALUES (%s)' % (','.join(['%s'] * 9))
                    cursor.execute(sql, ('Malaysia', domain, c, class_2, None, title, date, content, url))
                    conn.commit()
                    cursor.close()
    conn.close()
'''
Created on Feb 28, 2015

@author: hoavu
'''
from newspaper.article import Article

# Demo: download and print the body text of a single Huffington Post article.
url = 'http://www.huffingtonpost.com/2015/02/27/jennifer-lawrence-david-o-russell_n_6772866.html'
article = Article(url)
article.download()
article.parse()
#print(article.html)
print(article.text)
def fetch_story(request):
    """Django view: fetch and persist stories from a source's RSS feed.

    Expects a GET request with ``item_id`` identifying a Sourcing row.
    Parses the feed, downloads each entry with newspaper, normalizes the
    publish date, saves previously-unseen stories, and renders the list.
    Redirects to the sources list on non-GET or on a missing/unknown source.
    """
    if request.method == 'GET':
        # List to store all the parsed RSS entries.
        story_list = []
        # Get Source object id passed through the request.
        source_id = request.GET.get('item_id')
        if source_id is None:
            # No id supplied: return to the sources list.
            return HttpResponseRedirect('/sources_list/')
        try:
            rss_obj = Sourcing.objects.get(id=source_id)
        except Sourcing.DoesNotExist:
            messages.info(request, 'Source Does Not Exist, Please try another one.')
            return HttpResponseRedirect('/sources_list/')
        # Parse the RSS URL and get the data.
        feed_data = feedparser.parse(rss_obj.rss_url)
        # bozo == 1 means the URL is not well-formed RSS.
        if feed_data.bozo == 1:
            url_error = {
                'Possible Wrong URL. Click here to go back to Sources page.'
            }
            return render_to_response('fetch_story.html', {
                'url_error': url_error,
                'user': request.user
            })
        else:
            for data in feed_data.get('entries'):
                story_url = data.get('link')
                # Empty/broken RSS entry: bail out to the listing page.
                if story_url is None:
                    rss_error = {
                        'Either RSS is empty or RSS is broken. '
                        'Click here to go back to Story Listing page'
                    }
                    return render_to_response('fetch_story.html', {
                        'rss_error': rss_error,
                        'user': request.user
                    })
                # Use the newspaper library to download the article.
                article = Article(story_url)
                try:
                    article.download()
                except ArticleException:
                    logger.debug("Article Download exception in : %s" % story_url)
                try:
                    article.parse()
                except ArticleException:
                    logger.debug("Exception in article parse")
                article_instance = article
                # Normalize the publish date: fall back to the feed's
                # 'published' field, then to "now".
                if article_instance.publish_date is None:
                    if data.get('published') is None:
                        article_instance.publish_date = datetime.now(
                        ).strftime('%Y-%m-%d %H:%M:%S')
                    else:
                        article_instance.publish_date = datetime.strptime(
                            data.get('published'),
                            '%a, %d %b %Y %H:%M:%S GMT').strftime(
                                '%Y-%m-%d %H:%M:%S')
                elif not isinstance(article_instance.publish_date, datetime):
                    article_instance.publish_date = datetime.now().strftime(
                        '%Y-%m-%d %H:%M:%S')
                # BUG FIX: original used `article_instance.text is ''` — an
                # identity comparison with a literal (SyntaxWarning, and not
                # guaranteed to work). Use equality instead.
                if article_instance.text == '':
                    article_instance.text = "This is a Dummy text as some error occurred while fetching body of this story. \
                    Click the Story title to visit the Story page."
                try:
                    # Only save stories we have not seen before.
                    Stories.objects.select_related('source').get(url=story_url)
                except Stories.DoesNotExist:
                    story = Stories(title=article_instance.title,
                                    source=rss_obj,
                                    pub_date=article_instance.publish_date,
                                    body_text=article_instance.text,
                                    url=article_instance.url)
                    story.save()
                # Collect every downloaded article for the template.
                story_list += [article_instance]
            return render_to_response('fetch_story.html', {
                'data': story_list,
                'rss_id': rss_obj,
                'user': request.user
            })
    else:
        return HttpResponseRedirect('/sources_list/')
def handle(self, *args, **options):
    """Management-command entry point: refresh stories from all RSS sources.

    Iterates every Sourcing row, parses its feed, downloads entries not
    already stored (by URL), normalizes publish dates, saves new Stories,
    and prints a summary of counts and execution time.
    """
    source_obj = Sourcing.objects.all()
    stories_list = list(Stories.objects.values_list('url', flat=True))
    # Counters for the final report.
    not_rss_url = 0
    fetched_story_count = 0
    existing_story_count = len(stories_list)
    download_exception = 0
    parsing_exception = 0
    broken_rss_list = 0
    print("""\n\n ------------------------Started fetching Url's:------------------------ \n """)
    start_time = datetime.now()
    sources = tqdm(source_obj)
    for list_item in sources:
        # Sources progress bar.
        sources.set_description('Source Completed ')
        feed_data = feedparser.parse(list_item.rss_url)
        # bozo == 1 means the URL is not well-formed RSS.
        if feed_data.bozo == 1:
            logger.debug("Not a RSS url : %s" % list_item.rss_url)
            not_rss_url += 1
        else:
            # Stories progress bar using tqdm.
            story_entries = tqdm(feed_data.get('entries'))
            # Iterate each story URL; already-known URLs are skipped,
            # new ones are downloaded and saved to the Stories table.
            for data in story_entries:
                story_entries.set_description('Stories Completed ')
                story_url = data.get('link')
                if story_url is None:
                    # Empty or broken feed entry.
                    logger.debug("No feed data in RSS URL: %s" % list_item.rss_url)
                    broken_rss_list += 1
                else:
                    if story_url in stories_list:
                        # Already stored; drop from the working list so the
                        # membership test shrinks as we go.
                        stories_list.remove(story_url)
                    else:
                        article = Article(story_url)
                        try:
                            article.download()
                        except ArticleException:
                            logger.debug("Article Download exception in : %s" % story_url)
                            download_exception += 1
                        try:
                            article.parse()
                        except ArticleException:
                            logger.debug("Article parse exception in : %s" % story_url)
                            parsing_exception += 1
                        article_instance = article
                        # If the date is missing or not a datetime, fall back
                        # to the feed's 'published' field, then to "now".
                        if article_instance.publish_date is None:
                            if data.get('published') is None:
                                article_instance.publish_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                            else:
                                article_instance.publish_date = datetime.strptime(
                                    data.get('published'), '%a, %d %b %Y %H:%M:%S GMT'
                                ).strftime('%Y-%m-%d %H:%M:%S')
                        elif not isinstance(article_instance.publish_date, datetime):
                            article_instance.publish_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                        # BUG FIX: original used `article_instance.text is ''`,
                        # an identity comparison with a literal (SyntaxWarning,
                        # not guaranteed). Use equality instead.
                        if article_instance.text == '':
                            article_instance.text = "This is a Dummy text as some error occurred while fetching body of this story. \
                            Click the Story title to visit the Story page."
                        # Save the new story.
                        story = Stories(
                            title=article_instance.title,
                            source=list_item,
                            pub_date=article_instance.publish_date,
                            body_text=article_instance.text,
                            url=article_instance.url
                        )
                        story.save()
                        fetched_story_count += 1
    stop_time = datetime.now()
    execution_time = stop_time - start_time
    final_count = len(Stories.objects.values_list('url', flat=True))
    print("""
    ------------------------Finished fetching Url's:------------------------
    Final Result:
    No of Existing Stories          : {0}
    No of New Stories Fetched       : {1}
    No of wrong Rss Url's           : {2}
    No of Broken or Empty Rss Url's : {3}
    No of Stories not Downloaded    : {4}
    No of Stories not Parsed        : {5}
    -------------------------------------------------
    Total Stories                   : {6}
    -------------------------------------------------
    Process Execution time          : {7}
    ------------------------------------------------------------------------
    """.format(existing_story_count, fetched_story_count, not_rss_url,
               broken_rss_list, download_exception, parsing_exception,
               final_count, execution_time))
def crawl_today():
    """
    Daily update crawl for bernama.com categories; fetches up to two listing
    pages per category. (Original docstring was in Chinese.)
    """
    now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print('\n', domain, now, "BEGIN", flush=True)
    # Retry the MySQL connection until it succeeds (3s back-off per attempt).
    while True:
        try:
            conn = connect(host=host, user='******', password='******', port=3306, db='chinaaseanocean')
        except OperationalError as e:
            print(e)
            time.sleep(3)
        else:
            break
    for key in class_.keys():
        class_1 = class_[key]
        # print(class_1)
        page = 1
        while page < 3:  # at most pages 1 and 2 per category
            page_url = base_url + key + '&max_page=' + str(page)
            # print(page_url)
            failure = 0
            # Up to 3 attempts to fetch the listing page; if all fail,
            # the while's `else` skips to the next page/category iteration.
            while failure < 3:
                try:
                    doc = pq(page_url, headers=headers, verify=False)
                except Exception as e:
                    failure += 1
                    print(e)
                else:
                    break
            else:
                continue
            a_list = doc('div.w3-justify a')
            for a in a_list.items():
                news_url = 'http://www.bernama.com/en/' + a.attr('href')
                article = Article(news_url)
                try:
                    article.download()
                    article.parse()
                except ArticleException as e:
                    # Skip articles that fail to download/parse.
                    print(e)
                    continue
                # Strip pattern matches and newlines from the body.
                # NOTE(review): `pattern` is a module-level regex used both to
                # clean text here and to find a date below — confirm intent.
                content = pattern.sub('', article.text).replace('\n', '')
                if content:
                    url = article.url
                    title = article.title
                    try:
                        # Reverse a d/m/y-style match into y-m-d.
                        date = '-'.join(
                            pattern.findall(article.html)[0].split('/')[::-1])
                    except:
                        # NOTE(review): bare except — any failure yields ''.
                        date = ''
                    # print(title, date, content, sep='\n')
                    cursor = conn.cursor()
                    # REPLACE so re-crawled URLs overwrite; 8-column table.
                    sql = 'REPLACE INTO `asean_news` VALUES (%s)' % (','.join(
                        ['%s'] * 8))
                    cursor.execute(sql, ('Malaysia', domain, class_1, None,
                                         title, date, content, url))
                    conn.commit()
                    cursor.close()
            # Fewer than 7 links on a page means the category is exhausted.
            if len(a_list) < 7:
                break
            page = page + 1
    conn.close()
    now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print('\n', domain, now, "DONE")
'''
import requests
import urllib.parse
import nltk
from newspaper.article import Article
from crawlerApp.utils import get_cosine, text_to_vector, normalize_text
'''get stop words first '''
# Load stop words from a local file into a single string.
with open ("stopwords_en.txt", "r") as myfile:
    stopwords=myfile.read().replace('\n', '')
# Download two articles covering the same story to compare their similarity.
normalized_url = 'http://www.nytimes.com/2015/05/25/science/john-nash-a-beautiful-mind-subject-and-nobel-winner-dies-at-86.html'
article1 = Article(normalized_url)
article1.download()
""" 0.62 is good threshold"""
normalized_url2 = 'http://abcnews.go.com/US/john-nash-beautiful-mind-mathematician-wife-killed-jersey/story?id=31268512'
article2 = Article(normalized_url2)
article2.download()
print("download finished")
article1.parse()
string1 = article1.text
article2.parse()
string2 = article2.text
# Normalize text before vectorizing/comparing (see crawlerApp.utils).
normalised_string1 = normalize_text(string1)
def getArticleTitleText(url):
    """Download and parse the article at *url*.

    Returns:
        list: ``[title, text]`` where ``title`` is a str and ``text`` is the
        article body encoded as UTF-8 bytes.
    """
    article = Article(url)
    article.download()
    # NOTE: the original evaluated `article.html` as a bare expression here;
    # that statement had no effect and has been removed.
    article.parse()
    return [article.title, article.text.encode('utf-8')]
@author: hoavu
'''
import requests
import urllib.parse
import nltk
from newspaper.article import Article
from crawlerApp.utils import get_cosine, text_to_vector, normalize_text
'''get stop words first '''
# Load stop words from a local file into a single string.
with open("stopwords_en.txt", "r") as myfile:
    stopwords = myfile.read().replace('\n', '')
# Download two articles covering the same story to compare their similarity.
normalized_url = 'http://www.nytimes.com/2015/05/25/science/john-nash-a-beautiful-mind-subject-and-nobel-winner-dies-at-86.html'
article1 = Article(normalized_url)
article1.download()
""" 0.62 is good threshold"""
normalized_url2 = 'http://abcnews.go.com/US/john-nash-beautiful-mind-mathematician-wife-killed-jersey/story?id=31268512'
article2 = Article(normalized_url2)
article2.download()
print("download finished")
article1.parse()
string1 = article1.text
article2.parse()
string2 = article2.text
# Normalize both texts before vectorizing/comparing (see crawlerApp.utils).
normalised_string1 = normalize_text(string1)
normalised_string2 = normalize_text(string2)
'''
Created on Feb 28, 2015

@author: hoavu
'''
from newspaper.article import Article

# Demo: download and print the body text of a single Huffington Post article.
url = 'http://www.huffingtonpost.com/2015/02/27/jennifer-lawrence-david-o-russell_n_6772866.html'
article = Article(url)
article.download()
article.parse()
#print(article.html)
print(article.text)
def download_and_parse(article: Article):
    """Best-effort download and parse of *article* in place.

    Any newspaper ArticleException (bad URL, fetch failure, unparseable
    HTML) is swallowed; the caller must check the article's state.
    """
    try:
        article.download()
    except newspaper.article.ArticleException:
        return
    try:
        article.parse()
    except newspaper.article.ArticleException:
        return
# Demo script: parse a Chinese-language article with newspaper.
from newspaper.article import Article
import newspaper

# url = 'http://fox13now.com/2013/12/30/new-year-new-laws-obamacare-pot-guns-and-drones/'
# # article = Article(url)
# # article.parse()
# article.authors
# article.text
# cnn_paper = newspaper.build('http://cnn.com')
url = 'http://news.163.com/17/0312/10/CFAP3Q9G000189FH.html'
a = Article(url, language='zh')  # Chinese
a.download()
a.parse()
# NOTE(review): a.nlp() is never called, so keywords is expected to be
# empty here — confirm whether nlp() was intended before this print.
print(a.keywords)
print("===============")
print(a.title)
print("===============")
print(a.authors)
print("===============")
print(a.text[:150])
# filter_regex = re.compile(r'[^a-zA-Z0-9\ ]')
# title_text_h1 = "我与总书记议国是:建设社会稳定长治久安新AA边疆,总书记 社会稳定 全国人大代表00000ddd"
# filter_title_text_h1 = filter_regex.sub('', title_text_h1).lower()
# print(filter_title_text_h1)