예제 #1
0
 def get_search_article(self, keyword, offset=0):
     """Fetch search-result pages for ``keyword`` until the API reports no more.

     Each page is requested from the ``/search_content/`` endpoint through
     ``self.s`` using a proxy from ``get_proxy_ip()``, and its ``data``
     entries are handed to ``self.parse_data``. While the response has
     ``has_more == 1`` the page counter ``self.page`` is incremented and the
     next offset is ``20 * self.page``. After the last page,
     ``self.search_item_list`` is passed to ``toutiaodb.save``.

     Args:
         keyword: Raw (unencoded) search keyword.
         offset: Result offset of the first page to request.
     """
     # Quote exactly once. The previous recursive version passed the already
     # quoted keyword back into itself, so it was re-quoted on every page
     # ('%' -> '%25') and later pages searched for a different string.
     quoted = urllib.request.quote(keyword)
     path_template = (
         "/search_content/?offset={}&format=json&keyword={}"
         "&autoload=true&count=20&cur_tab=1&from=search_tab")
     referer = "https://www.toutiao.com/search/?keyword={}".format(quoted)

     # Iterate instead of recursing so the number of result pages is not
     # limited by the interpreter's recursion depth.
     while True:
         path = path_template.format(offset, quoted)
         self.s.headers.update({
             'User-Agent':
             'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
             'Connection': 'keep-alive',
             'authority': 'www.toutiao.com',
             'referer': referer,
             'method': 'GET',
             'path': path,
             'scheme': 'https',
         })
         req = self.s.get("https://www.toutiao.com" + path,
                          proxies=get_proxy_ip())
         # Random 3-5 s pause to throttle the request rate.
         time.sleep(random.random() * 2 + 3)
         data = json.loads(req.text)
         self.parse_data(data['data'])
         if data['has_more'] != 1:
             break
         self.page = self.page + 1
         offset = 20 * self.page
         # Extra fixed pause before asking for the next page.
         time.sleep(2)

     toutiaodb.save(self.search_item_list)
예제 #2
0
 def fetch_user_articles(self, user, browser):
     """Fetch a user's article pages until the API reports no more.

     For every page the ``as`` / ``cp`` / ``_signature`` values are taken
     from the JSON returned by ``self.get_js()``; once ``self.user_page`` is
     above zero the signature is instead computed in ``browser`` via
     ``window.TAC.sign``. Each page's ``data`` is handed to
     ``self.parse_user_artcle``. After the last page,
     ``self.user_artcile_list`` is passed to ``toutiaodb.save``.

     Args:
         user: Object providing ``user_id`` and ``media_url``; ``user_id``
             is concatenated with strings, so it must be a ``str``.
         browser: Object exposing ``execute_script`` (used to evaluate
             ``window.TAC.sign`` in a page).
     """
     path_template = (
         "/c/user/article/?page_type=1&user_id={}&max_behot_time={}"
         "&count=20&as={}&cp={}&_signature={}")
     # Pagination cursor. It is kept across iterations; the previous
     # recursive version reset it to "0" on every call, so the same first
     # page was requested again and again.
     max_behot_time = "0"

     while True:
         honey = json.loads(self.get_js())
         signature = honey['_signature']
         if self.user_page > 0:
             signature = browser.execute_script(
                 "return window.TAC.sign(" + user.user_id +
                 max_behot_time + ")")
         path = path_template.format(user.user_id, max_behot_time,
                                     honey['as'], honey['cp'], signature)
         self.s.headers.update({
             'User-Agent':
             'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
             'Connection': 'keep-alive',
             'authority': 'www.toutiao.com',
             'referer': user.media_url,
             'method': 'GET',
             'path': path,
             'scheme': 'https',
         })
         req = self.s.get("https://www.toutiao.com" + path,
                          proxies=get_proxy_ip())
         # Random 2-4 s pause to throttle the request rate.
         time.sleep(random.random() * 2 + 2)
         data = json.loads(req.text)

         # NOTE(review): these are attributes of the ``toutiaoitem`` class,
         # not of ``user``; ``user.user_id`` / ``user.media_url`` may have
         # been intended - confirm against parse_user_artcle.
         self.parse_user_artcle(data['data'], toutiaoitem.user_id,
                                toutiaoitem.media_url)
         if not data['has_more']:
             break

         self.user_page = self.user_page + 1
         # The next cursor lives under the literal key 'max_behot_time';
         # the old code indexed with the cursor's *value* (e.g. "0").
         # It is only read when another page follows.
         max_behot_time = str(data['next']['max_behot_time'])
         # Extra fixed pause before asking for the next page.
         time.sleep(2)

     toutiaodb.save(self.user_artcile_list)
예제 #3
0
    def get_channel_data(self, page):
        """Fetch ``page`` pages of the channel feed and save the articles.

        A Chrome webdriver is opened on ``self.url`` and used to compute the
        request signature (``window.TAC.sign``) for every page after the
        first; the first page uses the ``_signature`` from ``self.get_js()``.
        Up to the first 10 entries of each response are converted to
        ``toutiaoitem`` objects, skipping entries tagged ``'ad'`` or
        ``'ad.platform.site'``. ``toutiaodb.save`` is called after every page.

        Args:
            page: Number of feed pages to request.
        """
        # Initial request to the channel page; the response body is unused.
        # Presumably this primes session cookies - TODO confirm.
        self.s.get(url=self.url, verify=False, proxies=get_proxy_ip())
        self.s.headers.update({'referer': self.url})

        ad_tags = ('ad', 'ad.platform.site')
        max_behot_time = '0'  # pagination cursor, refreshed from each response
        item_list = []
        browser = webdriver.Chrome()
        # try/finally so the browser process is shut down even if a request
        # or a parse step raises; previously it was never closed.
        try:
            browser.implicitly_wait(10)
            browser.get(self.url)
            for i in range(0, page):
                honey = json.loads(self.get_js())
                eas = honey['as']
                ecp = honey['cp']
                signature = honey['_signature']
                if i > 0:
                    signature = browser.execute_script(
                        "return window.TAC.sign(" + max_behot_time + ")")
                url = 'https://www.toutiao.com/api/pc/feed/?category={}&utm_source=toutiao&widen=1&max_behot_time={}&max_behot_time_tmp={}&tadrequire=true&as={}&cp={}&_signature={}'.format(
                    self.channel, max_behot_time, max_behot_time, eas, ecp,
                    signature)
                req = self.s.get(url=url,
                                 verify=False,
                                 proxies=get_proxy_ip())
                # Random 2-4 s pause to throttle the request rate.
                time.sleep(random.random() * 2 + 2)
                j = json.loads(req.text)

                # Slice instead of indexing 0..9 so a page with fewer than
                # ten entries no longer raises IndexError.
                for entry in j['data'][:10]:
                    # Skip advertisements. The old test used `or` between two
                    # `!=` comparisons, which is always true, and appended
                    # the item outside the test, so ads were never filtered.
                    if entry['tag'] in ad_tags:
                        continue

                    item = toutiaoitem()
                    now = time.time()
                    item.title = entry['title']  # title
                    item.source = entry['source']  # author
                    item.source_url = ('https://www.toutiao.com/' +
                                       entry['source_url'])  # article link
                    item.media_url = ('https://www.toutiao.com/' +
                                      entry['media_url'])  # author home page
                    item.article_genre = entry['article_genre']  # article type
                    # Optional fields fall back to an empty default.
                    item.comments_count = entry.get('comments_count', 0)
                    item.tag = entry['tag']  # channel name
                    item.chinese_tag = entry.get('chinese_tag', '')
                    item.label = entry.get('label', [])  # labels
                    item.abstract = entry.get('abstract', '')  # summary
                    behot = int(entry['behot_time'])
                    item.behot_time = time.strftime(
                        "%Y-%m-%d %H:%M:%S",
                        time.localtime(behot))  # publish time
                    item.collect_time = time.strftime(
                        "%Y-%m-%d %H:%M:%S",
                        time.localtime(now))  # crawl time
                    item.item_id = entry['item_id']
                    item.image_list = entry.get('image_list', [])
                    item.image_url = entry['image_url']
                    item.middle_image = entry['middle_image']
                    item_list.append(item)

                # NOTE(review): item_list accumulates across pages, so items
                # from earlier pages are passed to save() again here. Kept
                # as in the original - confirm toutiaodb.save de-duplicates.
                toutiaodb.save(item_list)
                time.sleep(2)
                max_behot_time = str(j['next']['max_behot_time'])
        finally:
            browser.quit()