def saveDB(self, data, source):
    # POST the parsed article to the remote ingestion endpoint (self.post_url).
    try:
        if len(data['content']) > 500:  # only persist articles with substantial content
            payload = urllib.parse.urlencode({
                'source': source,
                'sourceUrl': data['sourceUrl'],
                'title': data['title'],
                'authorName': data['authorName'],
                'releaseTime': data['releaseTime'],
                'content': data['content'],
                'img': data['img'],
                'jobId': data['jobId']
            }).encode('utf-8')
            response = urllib.request.urlopen(url=self.post_url, data=payload)
            response.read().decode('utf-8')
    except BaseException as e:
        NewsFeeds().point_log(str(NewsFeeds().localTime(time.time())), 'SavePostSql\t', str(e))
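# Illustrative only: the shape of the `data` dict that NewsFeeds.parsingUrl() is
# expected to return and that saveDB()/saveMySql() consume. The keys are taken
# from the calls above; the values below are made-up placeholders.
example_data = {
    'sourceUrl': 'https://example.com/some-article',
    'title': 'Example headline',
    'authorName': 'Example Author',
    'releaseTime': '2019-04-07 18:14:01',
    'content': 'Full article text (saveDB only posts it when longer than 500 characters) ...',
    'img': '/newsbreak/picture/example.jpg',
    'jobId': '0'
}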
def saveMySql(self, data, tableName=None, host='localhost', port=3306, user='******', password='******', db='crawler', charset='utf8', *args):
    # Insert the parsed article directly into the local MySQL `news` table.
    conn = pymysql.Connect(host=host, user=user, password=password, port=port, database=db, charset=charset)
    cursor = conn.cursor()
    try:
        sql = 'insert into news(sourceUrl,title,authorName,releaseTime,content,img) VALUES (%s,%s,%s,%s,%s,%s)'
        cursor.execute(sql, [
            data['sourceUrl'],
            data['title'],
            data['authorName'],
            data['releaseTime'],
            data['content'],
            data['img']
        ])
        conn.commit()
    except BaseException as e:
        NewsFeeds().point_log(str(NewsFeeds().localTime(time.time())), 'SaveMySql\t', str(e))
    finally:
        cursor.close()
        conn.close()
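# The INSERT above assumes a `news` table already exists in the `crawler`
# database. The actual DDL is not part of this section, so the helper below is
# only a sketch inferred from the column list in saveMySql; the function name,
# column types, and lengths are assumptions, not the project's real schema.
def ensureNewsTable(host='localhost', port=3306, user='******', password='******', db='crawler', charset='utf8'):
    conn = pymysql.Connect(host=host, user=user, password=password, port=port, database=db, charset=charset)
    cursor = conn.cursor()
    try:
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS news (
                id INT AUTO_INCREMENT PRIMARY KEY,
                sourceUrl VARCHAR(512),
                title VARCHAR(512),
                authorName VARCHAR(255),
                releaseTime VARCHAR(64),
                content TEXT,
                img VARCHAR(512)
            )
        ''')
        conn.commit()
    finally:
        cursor.close()
        conn.close()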
def mainNewsBreak(self):
    n = NewsFeeds()
    s = SaveSqlDb()
    nb = NewsBreak()
    url_nb = 'http://api.particlenews.com/Website/channel/news-list-for-best-channel?cstart=0&infinite=true&refresh=1&epoch=5&distribution=newsbreak&platform=1&cv=4.7.3&cend=10&appid=newsbreak&weather=true&fields=docid&fields=date&fields=image&fields=image_urls&fields=like&fields=source&fields=title&fields=url&fields=comment_count&fields=fb_share_total&fields=coach_mark_text&fields=up&fields=down&fields=summary&fields=favicon_id&fields=dominant_image&fields=contextMeta&fields=video_urls&fields=viewType&push_refresh=0&modularize=true&ts=2019-04-07+18%3A14%3A01+%2B0800&version=020025&net=wifi'
    docId = nb.parsingPost(url=url_nb)
    get_url = 'http://api.particlenews.com/Website/contents/content?related_docs=false&cv=4.7.3' \
              '&docid=' + docId + \
              '&appid=newsbreak&bottom_channels=false&distribution=newsbreak&platform=1&version=020025&net=wifi'
    news_list = nb.parsingGet(url=get_url)
    path = '/data/crawler'
    pic_path = '/newsbreak/picture/'
    number = 1
    for url in news_list:
        if url in self.have_met:
            continue
        self.have_met.add(url)
        data = n.parsingUrl(url=url, downloadPath=path, picPath=pic_path)
        if data is None:
            continue
        print('NB_detail_url\t', url)
        print('NB_number\t', number)
        number += 1
        if data['releaseTime'] is None or data['releaseTime'] == '':
            data['releaseTime'] = str(self.point_time)
        if self.post_DB:
            s.saveDB(data=data, source=2)
        else:
            s.saveMySql(data=data)
def mainSmartNews(self):
    n = NewsFeeds()
    s = SaveSqlDb()
    sm = SmartNews()
    news_list = sm.smartNews()
    path = '/data/crawler'
    pic_path = '/smartNews/picture/'
    number = 1
    for new in news_list:
        if new in self.have_met:
            continue
        self.have_met.add(new)
        data = n.parsingUrl(url=new, downloadPath=path, picPath=pic_path)
        if data is None:
            continue
        print('SM_detail_url\t', new)
        print('SM_number\t', number)
        number += 1
        if data['releaseTime'] is None or data['releaseTime'] == '':
            data['releaseTime'] = str(self.point_time)
        if self.post_DB:
            s.saveDB(data=data, source=5)
        else:
            s.saveMySql(data=data)
def mainBuzzFeed(self):
    n = NewsFeeds()
    s = SaveSqlDb()
    bf = BuzzFeed()
    top_urls = bf.parsingTopUrl()
    news_urls = bf.parsingNewsUrl()
    urls_list = top_urls + news_urls
    path = '/data/crawler'
    pic_path = '/buzzfeed/picture/'
    number = 1
    for url in urls_list:
        if url in self.have_met:
            continue
        self.have_met.add(url)
        data = n.parsingUrl(url=url, downloadPath=path, picPath=pic_path)
        if data is None:
            continue
        print('BF_detail_url\t', url)
        print('BF_number\t', number)
        number += 1
        if data['releaseTime'] is None or data['releaseTime'] == '':
            data['releaseTime'] = str(self.point_time)
        if self.post_DB:
            s.saveDB(data=data, source=3)
        else:
            s.saveMySql(data=data)
def mainGoogleNews(self):
    n = NewsFeeds()
    s = SaveSqlDb()
    gn = GoogleNews()
    news_list = gn.googleNews()
    path = '/data/crawler'
    pic_path = '/googleNews/picture/'
    number = 1
    for new in news_list:
        url = new.link.text
        if url in self.have_met:
            continue
        self.have_met.add(url)  # record the URL so it is not re-processed (mirrors the other crawlers)
        data = n.parsingUrl(url=url, downloadPath=path, picPath=pic_path)
        if data is None:
            continue
        print('GN_detail_url\t', url)
        print('GN_number\t', number)
        number += 1
        if data['releaseTime'] is None or data['releaseTime'] == '':
            data['releaseTime'] = str(self.point_time)
        if self.post_DB:
            s.saveDB(data=data, source=4)
        else:
            s.saveMySql(data=data)
def parsingShopping(self):
    try:
        pg = 1
        urls_shopping = []
        while pg < 6:
            shopping_url = 'https://www.buzzfeed.com/us/feedpage/feed/shopping?page=' + str(pg) + '&page_name=shopping'
            res = requests.get(url=shopping_url, headers=self.headers, cookies=self.cookies).text
            html = etree.HTML(res)
            urls_list = html.xpath('//a[@class="js-card__link link-gray"]/@href')
            urls_shopping += urls_list
            pg += 1
        return urls_shopping
    except BaseException as e:
        NewsFeeds().point_log(str(NewsFeeds().localTime(time.time())), 'BuzzFeedParsingShoppingUrl\t', str(e))
def parsingNewsUrl(self):
    try:
        pg = 1
        urls_news = []
        while pg < 6:
            new_url = 'https://www.buzzfeednews.com/us/feed/home?page=' + str(pg) + '&flexpro_enabled=1'
            res = requests.get(url=new_url, headers=self.headers, cookies=self.cookies).text
            html = etree.HTML(res)
            url_list = html.xpath('//div[@class="news-feed grid-layout-main"]//article/a/@href')
            urls_news += url_list
            pg += 1
        return urls_news
    except BaseException as e:
        NewsFeeds().point_log(str(NewsFeeds().localTime(time.time())), 'BuzzFeedParsingNewsUrl\t', str(e))
def parsingGet(self, url):
    try:
        res = requests.get(url, headers=self.headers, cookies=self.cookies).text
        data = json.loads(res)
        result = data["documents"]
        urlList = []
        for i in range(len(result)):
            sourceUrl = result[i]['url']
            urlList.append(sourceUrl)
        return urlList
    except BaseException as e:
        NewsFeeds().point_log(str(NewsFeeds().localTime(time.time())), 'NewsBreakGet\t', str(e))
def sendRequest(self, url):
    try:
        res = requests.post(url=url, headers=self.headers, cookies=self.cookies).text
        data = json.loads(res)
        item = data['data']['items']  # parse the feed items out of the returned JSON
        urlList = []
        for i in range(len(item)):
            sourceUrl = item[i]['article_url']  # source URL of each article
            urlList.append(sourceUrl)
        return urlList
    except BaseException as e:
        NewsFeeds().point_log(str(NewsFeeds().localTime(time.time())), 'TopBuzzSendRequest\t', str(e))
def parsingTopUrl(self):
    try:
        pg = 1
        urls_tops = []
        while pg < 6:
            top_url = 'https://www.buzzfeed.com/site-component/v1/en-us/morebuzz?page=' + str(pg) + '&page_size=15&image_crop=wide'
            res = requests.get(url=top_url, headers=self.headers, cookies=self.cookies).text
            data = json.loads(res)
            results = data['results']
            for i in range(len(results)):
                res_url = results[i]['url']
                if res_url is None or res_url == '':
                    continue  # skip entries without a URL
                urls_tops.append(res_url)
            pg += 1
        return urls_tops
    except BaseException as e:
        NewsFeeds().point_log(str(NewsFeeds().localTime(time.time())), 'BuzzFeedParsingTopUrl\t', str(e))
def parsingPost(self, url):
    try:
        res = requests.post(url=url, headers=self.headers, data=self.data, cookies=self.cookies).text
        data = json.loads(res)
        result = data['result']
        docList = []
        for i in result:
            try:
                docList.append(i['docid'])
            except:
                pass
        return ','.join(i for i in docList)
    except BaseException as e:
        NewsFeeds().point_log(str(NewsFeeds().localTime(time.time())), 'NewsBreakPost', str(e))
def mainTopBuzz(self):
    n = NewsFeeds()
    s = SaveSqlDb()
    tb = TopBuzz()
    # Request time: regex-split the timestamp so its fractional digits can be
    # hashed into the request signature.
    t = time.time()
    result = re.findall(r'.\d*', str(t))
    sign = tb.hash_code(result[1][1:])
    timestamp = result[0]
    url_tb = 'https://i16-tb.isnssdk.com/api/844/stream?session_impr_id=0&tab=General&count=20&min_behot_time=1.554174097999E9&loc_mode=7&lac=4314&cid=6439033' \
             '&sign=' + sign + \
             '&timestamp=' + timestamp + \
             '&logo=topbuzz&gender=0&bv_is_auto_play=0&youtube=0&manifest_version_code=844&app_version=8.4.4&iid=6672646082571388678&gaid=54b268f4-52c2-470c-a815-abd1d00acce9&original_channel=gp&channel=gp&fp=TlTrJzK1FYsqFYs5PlU1LMGSL2Xr&device_type=MIX+2&language=en&app_version_minor=8.4.4.01&resolution=2030*1080&openudid=ab50caa43e995042&update_version_code=8440&sys_language=zh&sys_region=cn&os_api=26&tz_name=Asia%2FShanghai&tz_offset=28800&dpi=440&brand=Xiaomi&ac=WIFI&device_id=6672637176796333574&os=android&os_version=8.0.0&version_code=844&hevc_supported=1&device_brand=Xiaomi&device_platform=android&sim_region=cn&region=us&aid=1106&ui_language=en'
    news_list = tb.sendRequest(url=url_tb)
    path = '/data/crawler'
    pic_path = '/topbuzz/picture/'
    number = 1
    for url in news_list:
        if url in self.have_met:
            continue
        self.have_met.add(url)
        data = n.parsingUrl(url=url, downloadPath=path, picPath=pic_path)
        if data is None:
            continue
        print('TB_detail_url\t', url)
        print('TB_number\t', number)
        number += 1
        if data['releaseTime'] is None or data['releaseTime'] == '':
            data['releaseTime'] = str(self.point_time)
        if self.post_DB:
            s.saveDB(data=data, source=1)
        else:
            s.saveMySql(data=data)
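# Worked example (assumed concrete value, for illustration only) of how the
# sign/timestamp pair above is derived from time.time():
#   t = 1554174097.999
#   re.findall(r'.\d*', str(t))  ->  ['1554174097', '.999']
# so timestamp is the whole-second part ('1554174097') and result[1][1:]
# ('999') is the fractional part that TopBuzz.hash_code() turns into sign.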