예제 #1
0
    def get_top(self, url, category):
        """Scrape one Baidu ranking page and store the rows found in it.

        Every direct child tag of the page's first ``<table>`` that contains
        both a ``<span>`` and an ``<a>`` is turned into one record. The
        collected records are passed to ``MongoUtil.saveMongoForHotSearch``.

        Args:
            url: Address of the ranking page to download.
            category: Value written to the ``category`` field of each record.
        """
        response = requests.get(url)
        # The body is decoded as gb18030 instead of whatever requests guessed.
        response.encoding = 'gb18030'
        soup = BeautifulSoup(response.text, 'lxml')

        records = []
        for row in soup.table.children:
            # Whitespace/text nodes between rows are not tags; skip them.
            if not isinstance(row, bs4.element.Tag):
                continue
            # Only rows holding both a <span> and a link describe an entry.
            if not (row.span and row.a):
                continue

            record = {}
            # Only the attribute *value* is quoted. Quoting the whole bracket
            # contents is not valid CSS and is rejected by the Soup Sieve
            # selector engine used by Beautiful Soup 4.7 and later.
            if row.select('a[class~="list-title"]'):
                record["url"] = row.a["href"]
            record["title"] = row.a.text.strip()

            icon_spans = row.select('span[class^="icon-"]')
            if icon_spans:
                # .string is None when the span has more than one child, and
                # the text may hold no digits; in both cases the record is
                # kept without a "count" instead of aborting the whole page.
                icon_text = icon_spans[0].string
                digits = re.search(r"\d+", icon_text) if icon_text else None
                if digits:
                    record["count"] = digits.group(0)

            record["createTime"] = DateUtil.nowSplit()
            record["source"] = "baidu"
            record["category"] = category
            records.append(record)

        # Persist everything gathered from this page in one call.
        MongoUtil.saveMongoForHotSearch(records)
예제 #2
0
    def get_top(self, url):
        """Fetch the Toutiao hot-word JSON feed and store its entries.

        The response body is parsed as JSON, and its ``data`` field is parsed
        as JSON a second time. Each item of the resulting ``search_words``
        list becomes one record handed to ``MongoUtil.saveMongoForHotSearch``.

        Args:
            url: Address of the hot-word endpoint to request.
        """
        request_headers = {'Content-Type': 'application/json; charset=utf-8'}
        response = requests.get(url, headers=request_headers)

        # The "data" value is decoded again because the code treats it as a
        # JSON document embedded in the outer JSON document.
        envelope = json.loads(response.text)
        payload = json.loads(envelope.get('data'))

        records = []
        # Records get a descending synthetic count: 99 for the first entry,
        # 98 for the second, and so on.
        for position, entry in enumerate(payload.get('search_words'), start=1):
            records.append({
                "url": entry.get("link"),
                "title": entry.get("q"),
                "count": 100 - position,
                "createTime": DateUtil.nowSplit(),
                "source": "toutiao",
                "category": "hot_search",
            })

        # Persist the whole batch in one call.
        MongoUtil.saveMongoForHotSearch(records)
예제 #3
0
    def get_top(self, url):
        """Scrape a Weibo hot-search page and store the rows found in it.

        Every direct child tag of the page's first ``<tbody>`` that contains
        both a ``<span>`` and an ``<a>`` is turned into one record. The
        collected records are passed to ``MongoUtil.saveMongoForHotSearch``.

        Args:
            url: Address of the hot-search page to download.
        """
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'lxml')

        records = []
        for row in soup.tbody.children:
            # Whitespace/text nodes between rows are not tags; skip them.
            if not isinstance(row, bs4.element.Tag):
                continue
            link = row.a
            # Only rows holding both a <span> and a link describe an entry.
            if not (row.span and link):
                continue

            # Test the attribute on the very link that is read afterwards.
            # The previous 'a["href_to"]' selector is not valid CSS for the
            # Soup Sieve engine (Beautiful Soup 4.7+), and it could match a
            # different <a> than row.a, which then raised KeyError.
            if link.has_attr("href_to"):
                path = link["href_to"]
            else:
                path = link["href"]

            records.append({
                "url": "http://s.weibo.com" + path,
                "title": link.text.strip(),
                "count": row.span.string,
                "createTime": DateUtil.nowSplit(),
                "source": "weibo",
                "category": "hot_search",
            })

        # Persist everything gathered from this page in one call.
        MongoUtil.saveMongoForHotSearch(records)
예제 #4
0
 def get_top(self, url):
     """Fetch the Yidian hot-keyword JSON feed and store its entries.

     Nothing is stored unless the response's ``code`` field equals 0. When it
     does, each item of ``keywords`` becomes one record handed to
     ``MongoUtil.saveMongoForHotSearch``.

     Args:
         url: Address of the hot-keyword endpoint to request.
     """
     payload = json.loads(requests.get(url).text)

     # Any code other than 0 means there is nothing to store.
     if payload.get("code") != 0:
         return

     records = []
     # Records get a descending synthetic count: 100 for the first keyword,
     # 99 for the second, and so on. The "url" field is stored empty.
     for position, keyword in enumerate(payload.get("keywords")):
         records.append({
             "url": "",
             "title": keyword.get("name"),
             "count": 100 - position,
             "createTime": DateUtil.nowSplit(),
             "source": "yidian",
             "category": "hot_search",
         })

     # Persist the whole batch in one call.
     MongoUtil.saveMongoForHotSearch(records)
예제 #5
0
def main():
    """Run the Xigua hot-word scrape once.

    Any exception raised along the way is printed rather than propagated.
    """
    hot_words_url = "https://www.ixigua.com/hot_words/"
    try:
        scraper = Top()
        scraper.get_top(hot_words_url)
        # The message is printed only after get_top has returned.
        stamp = DateUtil.nowSplit()
        print("xigua hot search start at time:%s" % stamp)
    except Exception as err:
        # Best-effort job: report the failure and return normally.
        print(err)
예제 #6
0
def main():
    """Run the Weibo real-time hot-search scrape once.

    Any exception raised along the way is printed rather than propagated.
    """
    summary_url = "http://s.weibo.com/top/summary?cate=realtimehot"
    try:
        scraper = Top()
        scraper.get_top(summary_url)
        # The message is printed only after get_top has returned.
        stamp = DateUtil.nowSplit()
        print("weibo hot search start at time:%s" % stamp)
    except Exception as err:
        # Best-effort job: report the failure and return normally.
        print(err)
예제 #7
0
def main():
    """Run the Toutiao hot-word scrape once.

    Any exception raised along the way is printed rather than propagated.
    """
    # Full request URL, including the device/app query parameters.
    hot_word_url = "https://is.snssdk.com/2/wap/search/extra/hot_word_list/?use_wk=1&hide_bar=1&hide_status_bar=1&background_colorkey=3&disable_web_progressView=1&enable_jump=1&is_new_ui=1&source=title&iid=56902458375&device_id=58205562141&channel=oppo-cpa&aid=13&app_name=news_article&version_code=705&version_name=7.0.5&device_platform=android&abflag=3&device_type=PBEM00&device_brand=OPPO&language=zh&os_api=27&os_version=8.1.0&openudid=7284731287f985db&manifest_version_code=705&resolution=1080*2340&dpi=480&update_version_code=70515&_rticket=1546829616083&plugin=26958&fp=crT_cW4_FrGtFlwOLlU1F2KIFzKe&format=json"
    try:
        scraper = Top()
        scraper.get_top(hot_word_url)
        # The message is printed only after get_top has returned.
        stamp = DateUtil.nowSplit()
        print("toutiao hot search start at time:%s" % stamp)
    except Exception as err:
        # Best-effort job: report the failure and return normally.
        print(err)
예제 #8
0
def main():
    """Run the Yidian hot-keyword scrape once.

    Any exception raised along the way is printed rather than propagated.
    """
    url_template = "https://www.yidianzixun.com/home/q/hot_search_keywords?appid=web_yidian&_=%s"
    try:
        scraper = Top()
        # The current millisecond timestamp fills the "_" query parameter.
        request_url = url_template % DateUtil.getNowTimeMillisecond()
        scraper.get_top(request_url)
        # The message is printed only after get_top has returned.
        stamp = DateUtil.nowSplit()
        print("yidian hot search start at time:%s" % stamp)
    except Exception as err:
        # Best-effort job: report the failure and return normally.
        print(err)
예제 #9
0
def main():
    """Run the Baidu hot-list scrape once for each of four ranking pages.

    The pages are fetched in order; the first exception stops the run and is
    printed rather than propagated.
    """
    # (page URL, category label) pairs, in the order they are scraped.
    pages = (
        # Real-time hot topics
        ("http://top.baidu.com/buzz?b=1&fr=topbuzz_b11",
         "hot_realtime"),
        # Today's hot topics
        ("http://top.baidu.com/buzz?b=341&c=513&fr=topbuzz_b341_c513",
         "hot_today"),
        # Livelihood hot topics
        ("http://top.baidu.com/buzz?b=342&c=513&fr=topbuzz_b341_c513",
         "hot_livelihood"),
        # Entertainment hot topics
        ("http://top.baidu.com/buzz?b=344&c=513&fr=topbuzz_b342_c513",
         "hot_entertainment"),
    )
    try:
        scraper = Top()
        for page_url, label in pages:
            scraper.get_top(page_url, label)
        # The message is printed only after every page has been processed.
        stamp = DateUtil.nowSplit()
        print("baidu hot search start at time:%s" % stamp)
    except Exception as err:
        # Best-effort job: report the failure and return normally.
        print(err)