def requests(cls, url, headers):
    """Download an article page and collect the text of its paragraphs.

    The page is requested up to three times. Only
    ``requests.exceptions.ConnectionError`` leads to another attempt; any
    other exception (for example ``AttributeError`` when the page has no
    ``#article_body`` element) propagates to the caller.

    Args:
        url: Address of the article page.
        headers: HTTP headers forwarded to ``requests.get``.

    Returns:
        A ``(videoLinkInContent, newsContent)`` tuple. The first item is
        always ``None``. The second item is the list of cleaned, non-empty
        ``<p>`` texts found under ``#article_body``, or ``None`` when all
        three attempts ended with a connection error.
    """
    for _attempt in range(3):
        try:
            timeSleepOne()
            response = requests.get(url, headers=headers)
            response.encoding = 'utf-8'
            timeSleepRandomly()
            page = BeautifulSoup(response.text, 'html.parser')

            articleTexts = []
            for paragraph in page.select_one("#article_body").select("p"):
                if paragraph.text == "":
                    continue
                articleTexts.append(
                    textMiningRegex.discardSpace(
                        textMiningRegex.replaceEscapeAlphabet(
                            paragraph.text)))

            # The article body itself carries no video link.
            return None, articleTexts
        except requests.exceptions.ConnectionError as err:
            print(url, "發生問題。", err)
            print()
            timeSleepRandomly()
            timeSleepTwo()

    # Every attempt failed with a connection error.
    return None, None
def requests(cls, url, headers):
    """Download an article page and collect the text inside ``<article>``.

    The page is requested up to three times. Only
    ``requests.exceptions.ConnectionError`` leads to another attempt; any
    other exception (for example ``AttributeError`` when the page has no
    ``<article>`` element) propagates to the caller.

    Args:
        url: Address of the article page.
        headers: HTTP headers forwarded to ``requests.get``.

    Returns:
        A ``(videoLinkInContent, newsContent)`` tuple. The first item is
        always ``None``. The second item is the list of cleaned text
        fragments of the ``<article>`` element, or ``None`` when all three
        attempts ended with a connection error.
    """
    for _attempt in range(3):
        try:
            timeSleepOne()
            res = requests.get(url, headers=headers)
            res.encoding = 'utf-8'
            timeSleepRandomly()
            soup = BeautifulSoup(res.text, 'html.parser')
            # ``stripped_strings`` yields plain ``str`` objects, which have
            # no ``.text`` attribute, so each string is handed to the
            # cleaners directly. Fragments mentioning "googletag.cmd.push"
            # or "function" are skipped (presumably inline script code --
            # confirm against real pages).
            newsContent = [
                textMiningRegex.discardSpace(
                    textMiningRegex.replaceEscapeAlphabet(row))
                for row in soup.find("article").stripped_strings
                if row != "" and "googletag.cmd.push" not in row
                and "function" not in row
            ]
            videoLinkInContent = None  # the article body carries no video
            break
        except requests.exceptions.ConnectionError as e:
            print(url, "發生問題。", e)
            print()
            timeSleepRandomly()
            timeSleepTwo()
            newsContent = None
            videoLinkInContent = None
    return videoLinkInContent, newsContent
def requests(cls, url, headers):
    """Download an article page and collect the text of its paragraphs.

    URLs containing ``https://ent.ltn.com.tw/news/`` are delegated to
    ``ltnRequests.requestsUrlWithENT``. Other pages are parsed with the
    ``lxml`` parser and their ``<p>`` elements under ``.text`` are read; if
    that raises ``AttributeError`` the same delegate is used as a fallback.

    The whole procedure is tried up to three times. Only
    ``requests.exceptions.ConnectionError`` leads to another attempt.

    Args:
        url: Address of the article page.
        headers: HTTP headers forwarded to ``requests.get`` and to
            ``ltnRequests.requestsUrlWithENT``.

    Returns:
        A ``(videoLinkInContent, newsContent)`` tuple. When the page is
        parsed here, the first item is ``None`` and the second is the list
        of cleaned, non-empty paragraph texts. When the delegate is used,
        both items are whatever it returned. ``(None, None)`` is returned
        when all three attempts ended with a connection error.
    """
    for _attempt in range(3):
        try:
            timeSleepOne()
            if "https://ent.ltn.com.tw/news/" in url:
                videoLink, articleTexts = ltnRequests.requestsUrlWithENT(
                    url, headers)
                return videoLink, articleTexts

            response = requests.get(url, headers=headers)
            response.encoding = 'utf-8'
            timeSleepRandomly()
            # html.parser is not robust enough here: it fails to extract the
            # content of https://ec.ltn.com.tw/article/paper/1295417
            page = BeautifulSoup(response.text, 'lxml')
            try:
                articleTexts = []
                for paragraph in page.select_one(".text").select("p"):
                    if paragraph.text == "":
                        continue
                    articleTexts.append(
                        textMiningRegex.discardSpace(
                            textMiningRegex.replaceEscapeAlphabet(
                                paragraph.text)))
                # The article body itself carries no video link.
                videoLink = None
            except AttributeError as parseError:
                # e.g. https://news.ltn.com.tw/news/consumer/paper/1284005
                # --> https://ent.ltn.com.tw/news/paper/1284005
                print("error code:", parseError, url)
                videoLink, articleTexts = ltnRequests.requestsUrlWithENT(
                    url, headers)
            return videoLink, articleTexts
        except requests.exceptions.ConnectionError as err:
            print(url, "發生問題。", err)
            print()
            timeSleepRandomly()
            timeSleepTwo()

    # Every attempt failed with a connection error.
    return None, None
def requests(cls, url, headers):
    """Download an article page and collect its text.

    The normal path reads the ``<p>`` elements under ``.ndArticle_margin``.
    If that raises ``AttributeError`` (e.g. ``'NoneType' object has no
    attribute 'select'``), the text is instead cut out of the
    ``window.Fusion`` script embedded in the page.

    The whole procedure is tried up to three times. Only
    ``requests.exceptions.ConnectionError`` leads to another attempt; other
    exceptions (such as ``IndexError`` when an expected marker is missing
    from the script) propagate to the caller.

    Args:
        url: Address of the article page.
        headers: HTTP headers forwarded to ``requests.get``.

    Returns:
        A ``(videoLinkInContent, newsContent)`` tuple. The first item is
        always ``None``. The second item is a list of text pieces, or
        ``None`` when all three attempts ended with a connection error.
    """

    def nonEmpty(pieces):
        # Drop the empty strings that str.split leaves behind.
        return [piece for piece in pieces if piece != ""]

    for _attempt in range(3):
        try:
            timeSleepOne()
            response = requests.get(url, headers=headers)
            response.encoding = 'utf-8'
            timeSleepRandomly()
            page = BeautifulSoup(response.text, 'html.parser')
            try:
                articleTexts = []
                for paragraph in page.select_one(".ndArticle_margin").select(
                        "p"):
                    if paragraph.text == "":
                        continue
                    articleTexts.append(
                        textMiningRegex.discardSpace(
                            textMiningRegex.replaceEscapeAlphabet(
                                paragraph.text)))
            except AttributeError:
                # AttributeError: 'NoneType' object has no attribute 'select'
                pageHtml = str(page)
                # Everything after the first '"content":"' of
                # Fusion.globalContent inside the window.Fusion script.
                fusionTail = pageHtml.split(
                    """<script type="application/javascript">window.Fusion="""
                )[1].split("Fusion.globalContent=")[1].split(
                    '"content":"')[1]

                # "<br> \xa0</p>" does not need to become "<br> \\xa0</p>".
                if "<br> \xa0</p>" in pageHtml:
                    # Background recorded with this branch: inserting such an
                    # article into selected_news_with_tfidf failed with SQL
                    # Server error 8152 ("String or binary data would be
                    # truncated") because news_content was too long.
                    # https://tw.appledaily.com/property/20191226/WCUY7RP45D2V45RLRN3RULU2QU/
                    fragment = fusionTail.split("<br> \xa0</p>")[0]
                    plainText = BeautifulSoup(fragment, "html.parser").text
                    articleTexts = nonEmpty(plainText.split(" "))
                else:
                    # https://tw.appledaily.com/gadget/20190927/IFU7ML7HXNAL2GHDNKOZULDNOU/
                    fragment = fusionTail.split("更多「")[0]
                    articleTexts = nonEmpty(fragment.split("<br /> <br />"))
                    if len("".join(articleTexts)) >= 3500:
                        # Alternative once considered for this test:
                        # '<br /> "' in the page HTML.
                        # https://tw.appledaily.com/gadget/20191029/KSU3NPGRYURXTCI3COIUE6KMNM/
                        print(
                            f"appledaily news content exceeds 3500: {url}")
                        fragment = fusionTail.split('<br /> "}')[0]
                        articleTexts = nonEmpty(
                            fragment.split("<br /> <br />"))
            # The article body itself carries no video link.
            return None, articleTexts
        except requests.exceptions.ConnectionError as err:
            print(url, "發生問題。", err)
            print()
            timeSleepRandomly()
            timeSleepTwo()

    # Every attempt failed with a connection error.
    return None, None
def requests(cls, url, headers):
    """Download an article page and collect the text of its paragraphs.

    Two page layouts are tried in order: ``<p>`` elements under
    ``#story_body_content``, then (layout after the 2020-02-07 redesign)
    ``<p>`` elements under ``<article class="article-content">``. If both
    raise ``AttributeError`` the page is checked for the site's 404 redirect
    script.

    The whole procedure is tried up to three times. Only
    ``requests.exceptions.ConnectionError`` leads to another attempt.

    Args:
        url: Address of the article page.
        headers: HTTP headers forwarded to ``requests.get``.

    Returns:
        A ``(videoLinkInContent, newsContent)`` tuple. The first item is
        always ``None``. The second item is the list of cleaned, non-empty
        paragraph texts, the string ``"404_None"`` when the page is
        recognised as a 404, or ``None`` when all three attempts ended with
        a connection error.

    Raises:
        AttributeError: Neither layout matched and the page was not
            recognised as a 404; the parsed page is printed first.
    """
    for _attempt in range(3):
        try:
            timeSleepOne()
            response = requests.get(url, headers=headers)
            response.encoding = 'utf-8'
            timeSleepRandomly()
            page = BeautifulSoup(response.text, 'html.parser')

            articleTexts = []
            for paragraph in page.select_one("#story_body_content").select(
                    "p"):
                if paragraph.text == "":
                    continue
                articleTexts.append(
                    textMiningRegex.discardSpace(
                        textMiningRegex.replaceEscapeAlphabet(
                            paragraph.text)))
            # The article body itself carries no video link.
            return None, articleTexts
        except AttributeError:
            try:
                # 2020-02-07: the udn page layout was redesigned.
                container = page.find("article", {"class": "article-content"})
                articleTexts = [
                    textMiningRegex.discardSpace(
                        textMiningRegex.replaceEscapeAlphabet(paragraph.text))
                    for paragraph in container.find_all("p")
                    if paragraph.text != ""
                ]
            except AttributeError:
                # A 404 answer from this site is an HTML page containing
                # tracking scripts plus a <body> script that sets
                #   window.location="/news/e404?nver";
                # e.g. https://udn.com/news/story/7238/3600804
                bodyScript = page.select_one("body").select_one("script")
                if searchWordTrueOrFalse("404", str(bodyScript)):
                    print(url, "發生問題:404!")
                    articleTexts = "404_None"
                else:
                    # Unknown situation: dump the page for inspection and
                    # re-raise. The assignment is never seen by the caller
                    # because the exception propagates.
                    print(page)
                    articleTexts = "404_None"
                    raise
            # The article body itself carries no video link.
            return None, articleTexts
        except requests.exceptions.ConnectionError as err:
            print(url, "發生問題。", err)
            print()
            timeSleepRandomly()
            timeSleepTwo()

    # Every attempt failed with a connection error.
    return None, None