Example #1
    def run(self):
        super().run()
        headers = HEADERS.copy()
        headers['Upgrade-Insecure-Requests'] = '1'
        headers['Referer'] = 'https://www.douban.com/group/explore'
        headers['Host'] = 'www.douban.com'
        res = requests.get(douban_url, headers=headers)
        res.encoding = 'utf-8'

        soup = BeautifulSoup(res.text, 'html.parser')

        _list = soup.select('div.channel-item')
        print(len(_list))  # debug output: number of entries found
        for item in _list:
            a_el = item.select('div.bd a')[0]
            title = a_el.text.strip()
            url = a_el.get('href')

            desc = item.select('div.block p')[0].text.strip()

            hot_item = HotItem(title, url, cate=types['douban'], desc=desc)
            self.arr.append(hot_item)

        hot_collection.delete_many({'cate': types['douban']})
        hot_collection.insert_many([vars(item) for item in self.arr])
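Every example on this page leans on the same scaffolding: a shared HEADERS dict, a types mapping of category labels, a HotItem record, a hot_collection MongoDB collection, and a base class whose run() each snippet extends via super().run(). None of those definitions appear in the examples, so the sketch below is an assumption about their shape, not code from the source:

import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient

# Assumed definitions -- only the names are taken from the examples on this page.
HEADERS = {'User-Agent': 'Mozilla/5.0'}         # shared base request headers
types = {'douban': 'douban', '36kr': '36kr'}    # one category label per source
hot_collection = MongoClient()['hot']['items']  # target MongoDB collection


class HotItem:
    def __init__(self, title, url, cate=None, desc=None):
        self.title = title
        self.url = url
        self.cate = cate
        self.desc = desc


class Spider:
    def __init__(self):
        self.arr = []

    def run(self):
        self.arr.clear()  # hypothetical: reset collected items between runs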
Example #2
    def run(self):
        super().run()
        headers = HEADERS.copy()
        headers['Upgrade-Insecure-Requests'] = '1'
        headers['Referer'] = 'https://36kr.com/'
        headers['Host'] = '36kr.com'
        res = requests.get(url_36kr, headers=headers)
        res.encoding = 'utf-8'

        soup = BeautifulSoup(res.text, 'html.parser')
        _list = soup.select('div.kr-home-flow-item')
        for item in _list:
            a_tag = item.select('a.article-item-title')
            if a_tag:
                title = a_tag[0].text.strip()
                url = 'https://36kr.com' + a_tag[0].get('href')
                desc = item.select(
                    'a.article-item-description')[0].text.strip()

                hot_item = HotItem(title, url, cate=types['36kr'], desc=desc)
                self.arr.append(hot_item)
        hot_collection.delete_many({'cate': types['36kr']})
        hot_collection.insert_many([vars(item) for item in self.arr])
Example #3
    def run(self):
        super().run()
        res = requests.get(WEIBOT_URL, headers=HEADERS)
        res.encoding = 'utf-8'

        soup = BeautifulSoup(res.text, 'html.parser')
        _list = soup.select('td.td-02 a')
        for item in _list:
            title = item.text.strip()
            url = 'https://s.weibo.com{}'.format(item.get('href'))
            hot_item = HotItem(title, url, cate=types['weibo'])
            self.arr.append(hot_item)
        hot_collection.delete_many({'cate': types['weibo']})
        hot_collection.insert_many([vars(item) for item in self.arr])
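One subtlety in the save step shared by all of these snippets: vars(item) returns the object's live __dict__, and pymongo's insert_many writes an _id field into each document it inserts, so every HotItem silently gains an _id attribute after the save. If that matters, copying the dicts first keeps the items clean (a sketch of the last two lines, not from the source):

        docs = [dict(vars(item)) for item in self.arr]  # shallow copies
        hot_collection.insert_many(docs)  # pymongo adds _id to the copies only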
Example #4
    def run(self):
        super().run()
        res = requests.get(V2EX_URL, headers=HEADERS)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        _list = soup.select('.box a.topic-link')
        for item in _list:
            title = item.text
            url = 'https://www.v2ex.com{}'.format(item.get('href'))
            hot_item = HotItem(title, url, cate=types['v2ex'])
            self.arr.append(hot_item)

        hot_collection.delete_many({'cate': types['v2ex']})
        hot_collection.insert_many([vars(item) for item in self.arr])
Example #5
    def run(self):
        super().run()
        res = requests.get(tieba_url, headers=HEADERS)
        res.encoding = 'utf-8'
        data = res.json()

        for item in data['data']['bang_topic']['topic_list']:
            hot_item = HotItem(title=item['topic_name'],
                               url=item['topic_url'],
                               cate=types['tieba'],
                               desc=item['topic_desc'])
            self.arr.append(hot_item)

        hot_collection.delete_many({'cate': types['tieba']})
        hot_collection.insert_many([vars(item) for item in self.arr])
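Another shared sharp edge: pymongo's insert_many raises InvalidOperation when handed an empty list, so a fetch that matches nothing still deletes the old records and then crashes. A hedged sketch of a guard (the persist helper is a made-up name, not part of the source):

def persist(cate, items):
    docs = [dict(vars(item)) for item in items]
    if not docs:  # insert_many([]) raises InvalidOperation
        return
    hot_collection.delete_many({'cate': cate})
    hot_collection.insert_many(docs)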
Example #6
    def run(self):
        super().run()
        res = requests.get(HUPU_URL, headers=HEADERS)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')

        top_list = soup.select('.bbsHotPit li')
        for item in top_list:
            a_tag = item.select('.textSpan a')[0]
            url = 'https://bbs.hupu.com/{}'.format(a_tag.get('href'))
            title = a_tag.get('title')
            # Drop the literal 'zt' marker embedded in some Hupu titles.
            title = title.replace('zt', '')
            hot_item = HotItem(title, url, cate=types['hupu'])
            self.arr.append(hot_item)
        hot_collection.delete_many({'cate': types['hupu']})
        hot_collection.insert_many([vars(item) for item in self.arr])
Example #7
    def run(self):
        super().run()
        headers = HEADERS.copy()
        headers['Upgrade-Insecure-Requests'] = '1'
        headers['Host'] = 'top.baidu.com'
        res = requests.get(baidu_url, headers=headers)
        res.encoding = 'gbk'

        soup = BeautifulSoup(res.text, 'html.parser')
        _list = soup.select('a.list-title')
        for item in _list:
            title = item.text.strip()
            url = item.get('href')
            hot_item = HotItem(title, url, cate=types['baidu'])

            self.arr.append(hot_item)

        hot_collection.delete_many({'cate': types['baidu']})
        hot_collection.insert_many([vars(item) for item in self.arr])
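Example #7 forces gbk because the Baidu board is not served as UTF-8. When the charset is not known up front, requests can also guess it from the response body; a sketch (apparent_encoding runs charset detection, so it costs a little time):

        res = requests.get(baidu_url, headers=headers)
        res.encoding = res.apparent_encoding  # detect the charset from the body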
Example #8
    def run(self):
        super().run()
        res = requests.get(github_URL, headers=HEADERS)
        res.encoding = 'utf-8'

        soup = BeautifulSoup(res.text, 'html.parser')
        repository = soup.select('article.Box-row')

        for row in repository:
            title_el = row.select('h1.h3.lh-condensed')[0]
            title = title_el.text.strip().replace(' ', '').replace('\n', '')
            url = 'https://github.com' + title_el.find('a').get('href')
            desc_el = row.select('p.col-9')
            desc = desc_el[0].text.strip() if desc_el else None
            hot_item = HotItem(title, url, cate=types['github'], desc=desc)
            self.arr.append(hot_item)

        hot_collection.delete_many({'cate': types['github']})
        hot_collection.insert_many([vars(item) for item in self.arr])
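Selectors like 'h1.h3.lh-condensed' are tied to GitHub's markup at the time of writing, and indexing [0] turns a layout change into a bare IndexError. A small defensive sketch (the pick helper is hypothetical):

def pick(row, selector):
    # Fail with a clear message instead of an IndexError when markup changes.
    matches = row.select(selector)
    if not matches:
        raise ValueError('selector {!r} matched nothing; '
                         'the page layout may have changed'.format(selector))
    return matches[0]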
Example #9
    def run(self):
        super().run()
        headers = HEADERS.copy()
        headers['Referer'] = 'https://www.zhihu.com/signin?next=%2Fhot'
        # Hardcoded session cookie captured from a logged-in browser; it expires.
        headers['Cookie'] = 'tgw_l7_route=a37704a413efa26cf3f23813004f1a3b; _zap=4369dfa8-8757-4a0a-9b23-114fe13b449c; _xsrf=8ce5a6f2-6d38-4c4f-8a94-f343511a4dc8; d_c0="AMDmP0jPHRCPTnmL4dcBXTUEm7lWNYvWtO0=|1569658401"; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1569658400; capsion_ticket="2|1:0|10:1569658401|14:capsion_ticket|44:YThjZTVmZmI3NDQwNGRlNGIyZTQ5NmQzYzUzZTY0MGQ=|62c39d95c612658919bf053eb0b13b70ccddcde4d6b4c534127f37c900e2411e"; l_n_c=1; r_cap_id="NTkxYTFkN2M3ODZiNDJmNTlkYTVhNzBiZjEzODIxYTA=|1569658760|3f15e311b6f99af3f1d95ed3a1ea1619ac75eef1"; cap_id="M2E1OWVmNTM5YjMwNDQ4M2JkZGIyN2IyYmUxYWYwYzA=|1569658760|be545d54fcaeacdd6084952d60d42c2a6477e9dc"; l_cap_id="OTg5NTkzMGEzZDcxNGU0ZDhjZmYyYWI0MzZjOWFmYTE=|1569658760|e4477741cdddda2bf782bbefac43bdb6dc414112"; n_c=1; z_c0=Mi4xejFiZEFRQUFBQUFBd09ZX1NNOGRFQmNBQUFCaEFsVk5rbVY4WGdBdS04a1NoY0otV0ZUQk5ydjh0a0RGb2ZjaFBn|1569658770|1b7f9d825f104d60dc719882276ec00880677c27; tshl=; tst=h; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1569659055; unlock_ticket="ABDMJM49ZggXAAAAYQJVTbkfj12-gPv33gH7u9Oq7-u4wMqZ5VMUMw=="'
        res = requests.get(ZHIHU_URL, headers=headers)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        top_list = soup.select('.HotList-list .HotItem-content')
        for item in top_list:
            a_tag = item.find('a')
            url = a_tag.get('href')
            title = a_tag.get('title')
            desc_tag = item.select('p')
            desc = desc_tag[0].text if desc_tag else None
            hot_item = HotItem(title, url, cate=types['zhihu'], desc=desc)
            self.arr.append(hot_item)
        hot_collection.delete_many({'cate': types['zhihu']})
        hot_collection.insert_many([vars(item) for item in self.arr])
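The Cookie header above is a session captured from a logged-in browser: it will expire, and hardcoding it leaks credentials into the code. A sketch that reads it from the environment instead (ZHIHU_COOKIE is an assumed variable name):

import os

# Assumed env var; export ZHIHU_COOKIE='...' before running.
cookie = os.environ.get('ZHIHU_COOKIE')
if cookie:
    headers['Cookie'] = cookie  # only sent when the variable is set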
Example #10
    def run(self):
        super().run()
        headers = HEADERS.copy()
        headers['Upgrade-Insecure-Requests'] = '1'
        headers['Referer'] = 'https://www.huxiu.com/channel/107.html'
        headers['Host'] = 'www.huxiu.com'
        res = requests.get(huxiu_url, headers=headers)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        _list = soup.select('div.article-items')
        for item in _list:
            content_el = item.find('div', class_='article-item__content')
            a_tag = content_el.select('a')[-1]

            title = a_tag.find('h5', class_='article-item__content__title').text.strip()
            url = 'https://www.huxiu.com' + a_tag.get('href')
            desc = a_tag.find('p', class_='article-item__content__intro').text.strip()

            hot_item = HotItem(title, url, cate=types['huxiu'], desc=desc)
            self.arr.append(hot_item)
        hot_collection.delete_many({'cate': types['huxiu']})
        hot_collection.insert_many([vars(item) for item in self.arr])
Example #11
    def run(self):
        super().run()
        headers = HEADERS.copy()
        headers['Upgrade-Insecure-Requests'] = '1'
        headers['Referer'] = 'https://www.guokr.com/scientific/'
        headers['Host'] = 'www.guokr.com'
        res = requests.get(guokr_url, headers=headers)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        _list = soup.select('div.article')
        for item in _list:
            a_tag = item.find('a', class_='article-title')
            if not a_tag:
                continue
            title = a_tag.text.strip()
            url = a_tag.get('href')
            desc = item.find('p', class_='article-summary').text.strip()
            hot_item = HotItem(title, url, cate=types['guokr'], desc=desc)
            self.arr.append(hot_item)
        hot_collection.delete_many({'cate': types['guokr']})
        hot_collection.insert_many([vars(item) for item in self.arr])
Example #12
    def run(self):
        super().run()
        headers = HEADERS.copy()
        headers['Upgrade-Insecure-Requests'] = '1'
        headers['Referer'] = 'http://bbs.tianya.cn/list.jsp?item=funinfo&grade=3&order=1'
        headers['Host'] = 'bbs.tianya.cn'
        res = requests.get(tinaya_url, headers=headers)

        soup = BeautifulSoup(res.text, 'html.parser')

        _list = soup.select('td.td-title')

        for item in _list:
            a_tag = item.find('a')
            title = a_tag.text.strip()
            url = 'http://bbs.tianya.cn' + a_tag.get('href')

            hot_item = HotItem(title, url, cate=types['tianya'])
            self.arr.append(hot_item)

        hot_collection.delete_many({'cate': types['tianya']})
        hot_collection.insert_many([vars(item) for item in self.arr])
Example #13
    def run(self):
        super().run()
        headers = HEADERS.copy()
        headers['Upgrade-Insecure-Requests'] = '1'
        headers['Referer'] = 'https://www.qdaily.com/tags/30.html'
        headers['Host'] = 'www.qdaily.com'
        res = requests.get(qdaily_url, headers=headers)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        _list = soup.select('div.packery-item.article')
        for item in _list:
            title_tag = item.find('h3', class_='title')
            if title_tag:
                title = title_tag.text.strip()
                url = 'https://qdaily.com' + item.find(
                    'a', class_='com-grid-banner-article').get('href')
            else:
                a_tag = item.find('a', class_='com-grid-article')
                url = 'https://qdaily.com' + a_tag.get('href')
                title = a_tag.find('h3', class_='smart-dotdotdot').text.strip()
            hot_item = HotItem(title, url, cate=types['qdaily'])
            self.arr.append(hot_item)
        hot_collection.delete_many({'cate': types['qdaily']})
        hot_collection.insert_many([vars(item) for item in self.arr])
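Each snippet is a run() method on its own scraper class, so a driver only needs to instantiate and call them in turn. A closing sketch, assuming the hypothetical Spider base sketched under Example #1 (the source never shows the concrete class names):

# Hypothetical driver: run every scraper that subclasses the assumed Spider base.
for spider_cls in Spider.__subclasses__():
    spider_cls().run()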