def get_top(self, url, category):
    """Scrape one ranking page and store its entries.

    The page at ``url`` is downloaded, decoded as gb18030 and parsed with
    the lxml parser. Every tag child of the document's first ``<table>``
    that contains both a ``<span>`` and an ``<a>`` yields one record with
    the keys ``url`` and ``title`` (only when the list-title selector
    matches), ``count`` (only when an icon span with digits is found),
    ``createTime``, ``source`` and ``category``. All records are passed to
    ``MongoUtil.saveMongoForHotSearch`` in a single call.

    Args:
        url: Address of the ranking page to download.
        category: Value stored under the ``category`` key of every record.
    """
    # Perform the HTTP request and force the decoding used for r.text.
    r = requests.get(url)
    r.encoding = 'gb18030'
    soup = BeautifulSoup(r.text, 'lxml')

    resultList = []
    for child in soup.table.children:
        # Skip non-tag children (e.g. text nodes between rows).
        if not isinstance(child, bs4.element.Tag):
            continue
        # Only rows that contain both a span and a link are of interest.
        if not (child.span and child.a):
            continue

        # One dict per row: address, title and count.
        tmp = {}
        if child.select('a["class~=list-title"]'):
            tmp["url"] = child.a["href"]
            tmp["title"] = child.a.text.strip()

        icon_spans = child.select('span["class^=icon-"]')
        if icon_spans:
            # .string is None when the span does not hold exactly one text
            # node, and the text may contain no digits; in either case the
            # row is kept without a "count" instead of aborting the crawl.
            span_text = icon_spans[0].string
            match = re.search(r"(\d+)", span_text) if span_text else None
            if match:
                tmp["count"] = match.group(0)

        tmp["createTime"] = DateUtil.nowSplit()
        tmp["source"] = "baidu"
        tmp["category"] = category
        resultList.append(tmp)

    # Persist everything that was collected.
    MongoUtil.saveMongoForHotSearch(resultList)
def get_top(self, url):
    """Download the hot-word list from ``url`` and store it.

    The response body is parsed as JSON; its ``data`` field is itself a JSON
    string and is parsed a second time. Each element of ``search_words``
    becomes one record (``url`` from ``link``, ``title`` from ``q``) and the
    records are passed to ``MongoUtil.saveMongoForHotSearch``.

    Args:
        url: Address of the JSON endpoint to query.
    """
    request_headers = {'Content-Type': 'application/json; charset=utf-8'}
    response = requests.get(url, headers=request_headers)

    # Outer document first, then the JSON string nested under "data".
    envelope = json.loads(response.text)
    payload = json.loads(envelope.get('data'))

    # No count is read from the entries: a synthetic, descending one is
    # assigned by list position (99 for the first entry, 98 for the next, ...).
    records = [
        {
            "url": entry.get("link"),
            "title": entry.get("q"),
            "count": 100 - rank,
            "createTime": DateUtil.nowSplit(),
            "source": "toutiao",
            "category": "hot_search",
        }
        for rank, entry in enumerate(payload.get('search_words'), start=1)
    ]

    # Persist everything that was collected.
    MongoUtil.saveMongoForHotSearch(records)
def get_top(self, url):
    """Scrape the ranking table at ``url`` and store its entries.

    The page is parsed with the lxml parser and the children of its first
    ``<tbody>`` are walked. Every tag child containing both a ``<span>`` and
    an ``<a>`` yields one record; the records are passed to
    ``MongoUtil.saveMongoForHotSearch``.

    Args:
        url: Address of the page to download.
    """
    site_prefix = "http://s.weibo.com"
    page = BeautifulSoup(requests.get(url).text, 'lxml')

    records = []
    for row in page.tbody.children:
        # Skip non-tag children (e.g. text nodes between rows).
        if not isinstance(row, bs4.element.Tag):
            continue
        # Only rows that contain both a span and a link are of interest.
        if not (row.span and row.a):
            continue

        # Use the "href_to" attribute when the selector finds one,
        # otherwise fall back to the plain "href".
        link_attr = "href_to" if row.select('a["href_to"]') else "href"
        records.append({
            "url": site_prefix + row.a[link_attr],
            "title": row.a.text.strip(),
            "count": row.span.string,
            "createTime": DateUtil.nowSplit(),
            "source": "weibo",
            "category": "hot_search",
        })

    # Persist everything that was collected.
    MongoUtil.saveMongoForHotSearch(records)
def get_top(self, url):
    """Download the hot-search keywords from ``url`` and store them.

    The response body is parsed as JSON. Nothing is stored unless its
    ``code`` field equals 0; otherwise every element of ``keywords`` becomes
    one record (``title`` from ``name``, empty ``url``) and the records are
    passed to ``MongoUtil.saveMongoForHotSearch``.

    Args:
        url: Address of the JSON endpoint to query.
    """
    reply = json.loads(requests.get(url).text)

    # Any code other than 0 means there is nothing to store.
    if reply.get("code") != 0:
        return

    # No count is read from the entries: a synthetic, descending one is
    # assigned by list position (100 for the first entry, 99 for the next, ...).
    records = [
        {
            "url": "",
            "title": keyword.get("name"),
            "count": 100 - offset,
            "createTime": DateUtil.nowSplit(),
            "source": "yidian",
            "category": "hot_search",
        }
        for offset, keyword in enumerate(reply.get("keywords"))
    ]

    # Persist everything that was collected.
    MongoUtil.saveMongoForHotSearch(records)
def main():
    """Run one crawl of the ixigua hot-words URL.

    Any exception raised along the way is printed rather than propagated.
    """
    target_url = "https://www.ixigua.com/hot_words/"
    try:
        crawler = Top()
        crawler.get_top(target_url)
        # The timestamp line is emitted once get_top has returned.
        stamp = DateUtil.nowSplit()
        print("xigua hot search start at time:%s" % stamp)
    except Exception as err:
        print(err)
def main():
    """Run one crawl of the weibo real-time hot list URL.

    Any exception raised along the way is printed rather than propagated.
    """
    target_url = "http://s.weibo.com/top/summary?cate=realtimehot"
    try:
        crawler = Top()
        crawler.get_top(target_url)
        # The timestamp line is emitted once get_top has returned.
        stamp = DateUtil.nowSplit()
        print("weibo hot search start at time:%s" % stamp)
    except Exception as err:
        print(err)
def main():
    """Run one crawl of the hot-word-list endpoint.

    Any exception raised along the way is printed rather than propagated.
    """
    # Full request URL, including the fixed query string sent with each call.
    target_url = "https://is.snssdk.com/2/wap/search/extra/hot_word_list/?use_wk=1&hide_bar=1&hide_status_bar=1&background_colorkey=3&disable_web_progressView=1&enable_jump=1&is_new_ui=1&source=title&iid=56902458375&device_id=58205562141&channel=oppo-cpa&aid=13&app_name=news_article&version_code=705&version_name=7.0.5&device_platform=android&abflag=3&device_type=PBEM00&device_brand=OPPO&language=zh&os_api=27&os_version=8.1.0&openudid=7284731287f985db&manifest_version_code=705&resolution=1080*2340&dpi=480&update_version_code=70515&_rticket=1546829616083&plugin=26958&fp=crT_cW4_FrGtFlwOLlU1F2KIFzKe&format=json"
    try:
        crawler = Top()
        crawler.get_top(target_url)
        # The timestamp line is emitted once get_top has returned.
        stamp = DateUtil.nowSplit()
        print("toutiao hot search start at time:%s" % stamp)
    except Exception as err:
        print(err)
def main():
    """Run one crawl of the yidian hot-search-keywords endpoint.

    Any exception raised along the way is printed rather than propagated.
    """
    try:
        crawler = Top()
        # The "_" query parameter is filled with the value returned by
        # DateUtil.getNowTimeMillisecond() at call time.
        target_url = (
            "https://www.yidianzixun.com/home/q/hot_search_keywords?appid=web_yidian&_=%s"
            % DateUtil.getNowTimeMillisecond()
        )
        crawler.get_top(target_url)
        # The timestamp line is emitted once get_top has returned.
        stamp = DateUtil.nowSplit()
        print("yidian hot search start at time:%s" % stamp)
    except Exception as err:
        print(err)
def main():
    """Crawl four baidu ranking pages in sequence, one category each.

    Any exception raised along the way is printed rather than propagated;
    pages after the failing one are not crawled.
    """
    # (page URL, category label) pairs, crawled in this order.
    targets = (
        # real-time hot topics
        ("http://top.baidu.com/buzz?b=1&fr=topbuzz_b11", "hot_realtime"),
        # today's hot topics
        ("http://top.baidu.com/buzz?b=341&c=513&fr=topbuzz_b341_c513", "hot_today"),
        # livelihood hot topics
        ("http://top.baidu.com/buzz?b=342&c=513&fr=topbuzz_b341_c513", "hot_livelihood"),
        # entertainment hot topics
        ("http://top.baidu.com/buzz?b=344&c=513&fr=topbuzz_b342_c513", "hot_entertainment"),
    )
    try:
        crawler = Top()
        for page_url, category in targets:
            crawler.get_top(page_url, category)
        # The timestamp line is emitted once all pages have been processed.
        stamp = DateUtil.nowSplit()
        print("baidu hot search start at time:%s" % stamp)
    except Exception as err:
        print(err)