Example #1
    def __init__(self):
        self.headers_one = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            # 'Connection':'keep-alive',
            'Cookie':
            'cn_1255169715_dplus=%7B%22distinct_id%22%3A%20%2216730471952668-0ecf0ba7ae41cb-414f0120-15f900-16730471953461%22%2C%22sp%22%3A%20%7B%22%24_sessionid%22%3A%200%2C%22%24_sessionTime%22%3A%201542776168%2C%22%24dp%22%3A%200%2C%22%24_sessionPVTime%22%3A%201542776168%7D%7D; UM_distinctid=16730471952668-0ecf0ba7ae41cb-414f0120-15f900-16730471953461; JSESSIONID=208cee9fea61049d61e7d18f9e9c275ecf530a9e308a94dde36658adc01a0594; wuid=154945905891357; wuid_createAt=2018-11-21 12:56:9',
            'Host': 'www.yidianzixun.com',
            'Referer': 'http://www.yidianzixun.com/channel/c11',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest'
        }
        self.proxies = ['218.95.55.154:4243']

        # Deduplication list
        self.set_list = []
        # URLs that failed to parse
        self.error_url_list = []

        # Pool of cookies and channel ids; entries shaped like:
        '''
            {
                'channel_id': '',
                'cookies': ''
            },
        '''

        # Compute the crawl window automatically from the system time
        date = datetime.now() - timedelta(days=3)  # three days ago, not including today
        str_time = str(date).split(' ')[0]

        yesterday = datetime.now() - timedelta(days=1)  # yesterday

        now_time = str(yesterday).split(' ')[0]
        print('Crawl period: {} to {}'.format(str_time, now_time))

        logging.info('Crawl period: {} to {}'.format(str_time, now_time))
        # Start date, y-m-d (the earlier bound)
        self.start_time = str_time
        # End date, y-m-d (the later bound)
        self.end_time = now_time
        try:
            self.page_ip = proxies.res_ip()
            # self.page_ip = '115.219.77.241:2316'
        except Exception:
            time.sleep(3)
            print('Error while fetching a proxy IP: {}'.format(traceback.format_exc()))
            logger.error('Error while fetching a proxy IP: {}'.format(traceback.format_exc()))
            self.page_ip = proxies.res_ip()
        self.ip_count = 0
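
All of these spiders import an external `proxies` module and call `proxies.res_ip()` for a fresh proxy. That module is not shown anywhere in the examples; the following is a minimal hypothetical sketch of the assumed interface, using the sample addresses that appear in the snippets as a stand-in pool:

# proxies.py -- hypothetical stand-in for the proxy-pool module the spiders import;
# replace with the real proxy provider.
import random

_POOL = ['218.95.55.154:4243', '115.219.77.241:2316']  # sample addresses from the snippets

def res_ip():
    """Return one 'host:port' proxy string, as the callers expect."""
    return random.choice(_POOL)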
Example #2
    def run(self):

        url_list = [
            # Sports
            'http://www.yidianzixun.com/home/q/news_list_for_channel?channel_id=13402171666&cstart=0&cend=10&infinite=true&refresh=1&__from__=wap&_spt=yz~eaod%3B9%3E%3A8%3B%3D%3B%3C%3C%3C%3A%3B%3A&appid=web_yidian&_={}',
            # NBA
            'http://www.yidianzixun.com/home/q/news_list_for_channel?channel_id=13402171682&cstart=0&cend=10&infinite=true&refresh=1&__from__=wap&_spt=yz~eaod%3B9%3E%3A8%3B%3D%3B%3C28%3A%3B%3A&appid=web_yidian&_={}',
            # Finance
            'http://www.yidianzixun.com/home/q/news_list_for_channel?channel_id=13402171698&cstart=0&cend=10&infinite=true&refresh=1&__from__=wap&_spt=yz~eaod%3B9%3E%3A8%3B%3D%3B%3C32%3A%3B%3A&appid=web_yidian&_={}'
        ]
        for get_url in url_list:
            for i in range(2):
                try:
                    for j in range(30):
                        url = get_url.format(
                            str(time.time()).replace('.', '')[:-4])
                        try:
                            self.get_news_list_port(url)
                        except requests.exceptions.ProxyError:
                            print(traceback.format_exc())
                            break

                except TypeError:
                    print(traceback.format_exc())
                    logger.error('Content parse error')
                except Exception:
                    print(traceback.format_exc())
                    logger.error('Other error')

                time.sleep(10)
                self.page_ip = proxies.res_ip()
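
The `_={}` placeholder in each URL is filled with the current epoch time, dot stripped and trailing digits trimmed, so every request carries a unique cache-busting value. A minimal sketch of that expression factored into a helper (the function name is illustrative):

import time

def cache_buster():
    # time.time() gives e.g. 1547602597.123456; removing the dot and the last
    # four digits yields a millisecond-style timestamp string.
    return str(time.time()).replace('.', '')[:-4]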
Example #3
    def get_channel_id(self):
        url = 'http://www.yidianzixun.com/channel/c11'
        try:
            response = requests.get(url,
                                    proxies={'http': self.page_ip},
                                    timeout=30)
            data = response.content.decode()
            # Slice the page between 'channel_id' and the '汽车' (Autos) channel label
            data = re.search('channel_id(.*?)汽车', data).group(0)
            channel_id = re.search(r'\d{8,15}', data).group(0)
            cookies = response.headers['Set-Cookie']
            print(cookies)
            id = re.search(r'JSESSIONID=([a-z0-9]{30,80});', cookies).group(1)

            return channel_id, id
        except Exception:
            print(traceback.format_exc())

            if self.ip_count < 10:
                self.page_ip = proxies.res_ip()
                print('Rotating IP: ', self.page_ip)
                self.ip_count += 1
                time.sleep(5)
                # Return the recursive retry's result; otherwise the caller gets None
                return self.get_channel_id()
            else:
                raise IndexError
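
One caveat with the retry above: the recursive call must be returned (as fixed here), or the caller receives None after any retry. An iterative variant sidesteps recursion depth entirely; a sketch, with `_fetch_channel_id` standing in hypothetically for the request-and-parse body above:

    def get_channel_id_iterative(self):
        for attempt in range(10):
            try:
                return self._fetch_channel_id()  # hypothetical helper wrapping the request above
            except Exception:
                print(traceback.format_exc())
                self.page_ip = proxies.res_ip()  # rotate the proxy between attempts
                time.sleep(5)
        raise IndexError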
Example #4
    def run(self):

        url = 'http://www.yidianzixun.com/home/q/news_list_for_channel'
        get_time = time.time()
        get_time = ''.join(str(get_time).split('.'))
        get_time = get_time[:-2]
        start = 0
        for i in range(1, 16):
            time.sleep(5)
            print('Outer loop pass {}'.format(str(i)))
            try:
                channel_id, jsession_id = self.get_channel_id()
                print(channel_id)
                for j in range(40):
                    cookie = 'UM_distinctid=1683bb2492431a-094bc6de6a2a34-5d1e331c-15f900-1683bb249257ee; JSESSIONID={}; wuid=789835887113804; wuid_createAt=2019-01-16 9:36:16; weather_auth=2; Hm_lvt_15fafbae2b9b11d280c79eff3b840e45=1547602576; Hm_lpvt_15fafbae2b9b11d280c79eff3b840e45=1547602576; CNZZDATA1255169715=432644633-1547597213-http%253A%252F%252Fwww.yidianzixun.com%252F%7C1547597213; cn_1255169715_dplus=%7B%22distinct_id%22%3A%20%221683bb2492431a-094bc6de6a2a34-5d1e331c-15f900-1683bb249257ee%22%2C%22sp%22%3A%20%7B%22%24_sessionid%22%3A%200%2C%22%24_sessionTime%22%3A%201547602597%2C%22%24dp%22%3A%200%2C%22%24_sessionPVTime%22%3A%201547602597%7D%7D; captcha=s%3A229eaf5dc322b2b9390527804ef8124c.yGCMPLr%2FjIqPwCriUMcf0ZznNz7D3ZAabmz2DZWLIVM'.format(
                        jsession_id)
                    # print(cookie)
                    ip = self.page_ip

                    spt = self.get_spt(start, channel_id)  # build the signed _spt parameter
                    print(spt)

                    end = start + 10
                    params = {
                        'channel_id': channel_id,
                        'cstart': start,
                        'cend': end,
                        'infinite': 'true',
                        'refresh': '1',
                        '__from__': 'pc',
                        'multi': '5',
                        '_spt': spt,
                        'appid': 'web_yidian',
                        '_': get_time
                    }
                    self.get_news_list_port(url, params, cookie, ip)
            except TypeError:
                # print('sleeping 900s')
                # time.sleep(900)
                logger.error('Content parse error: %s', traceback.format_exc())
            except Exception:
                logger.error('Other error: %s', traceback.format_exc())
            self.page_ip = proxies.res_ip()

        # To boost the volume crawled, collect once more via Baidu search
        logger.info('Starting the Baidu crawl pass......')
        for i in range(0, 25):
            self.get_news_url(str(i * 10))

        print(list(set(self.error_url_list)))
        logger.info('Crawl finished......')
Example #5
    def get_news_list_port(self, url, params, cookie, ip):
        self.headers_one['Cookie'] = cookie
        response = requests.get(url,
                                params=params,
                                headers=self.headers_one,
                                proxies={'http': ip, 'https': ip})  # cover both schemes; the list URL is http
        print(response.url)
        data = response.content.decode()
        data = json.loads(data)
        data = data['result']
        print(data)
        if data:
            for news in data:
                item = {}
                title = news['title']
                item['title'] = title
                itemid = news['itemid']
                # Use a separate name so the `url` parameter stays intact for the retry below
                article_url = 'http://www.yidianzixun.com/article/' + itemid
                news_date = news['date']
                get_date = re.search(r'\d{4}-\d{2}-\d{2}', news_date).group(0)
                if 'V_' not in itemid:
                    if article_url not in self.set_list:
                        # self.write_news_jsonfile(item)
                        try:
                            print(article_url)
                            self.get_news_page_info(article_url)
                            self.set_list.append(article_url)
                        except IndexError:
                            print('Page parse error', article_url)
                            self.error_url_list.append(article_url)
                            self.page_ip = proxies.res_ip()
                            time.sleep(10)
                            print('Rotating IP:', self.page_ip)
        else:
            time.sleep(10)
            print('Retrying......')
            self.page_ip = proxies.res_ip()
            # Retry with the freshly rotated proxy, not the stale `ip` argument
            self.get_news_list_port(url, params, cookie, self.page_ip)
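
`self.set_list` is probed with `in` for every article, a linear scan over a list. Since ordering is never used, a `set` gives constant-time membership tests instead; a sketch of the swap (the attribute name `seen_urls` is illustrative):

        # in __init__:
        self.seen_urls = set()

        # in get_news_list_port:
        if article_url not in self.seen_urls:   # O(1) in a set vs O(n) in a list
            self.seen_urls.add(article_url)
            self.get_news_page_info(article_url)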
Example #6
    def get_news_url(self, carts):
        """
        从百度搜索关键词,然后获取符合的新闻的url
        :param carts:
        :return:
        """
        # 时间
        get_time = time.time()
        str_time = str(get_time)[:-4]
        date = datetime.now() - timedelta(days=7)
        a = str(date)[:-7]
        timeArray = time.strptime(a, "%Y-%m-%d %H:%M:%S")
        # Convert to an epoch timestamp
        timeStamp = int(time.mktime(timeArray))
        end_time = str(timeStamp) + '.' + str_time.split('.')[1]
        print(str_time, end_time)
        url = 'https://www.baidu.com/s?q1={}&q2=&q3=&q4=&gpc=stf%3D{}%2C{}%7Cstftype%3D1&ft=&q5=&q6=xiaohongshu.com&tn=baiduadv'.format(
            str(carts), end_time, str_time)
        print(url)
        # ip = random.choice(self.proxies_list)
        response = requests.get(url,
                                headers=self.headers_one,
                                verify=False,
                                timeout=30)  # , proxies={'https': ip}
        content = etree.HTML(response.content.decode())
        if content.xpath('.//h3[@class="t"]/a/@href'):
            url_list = content.xpath('.//h3[@class="t"]/a/@href')
            print(url_list)
            for url in url_list:
                # Swap only the scheme, once; a second 'http' inside the URL stays untouched
                news_url = url.replace('http://', 'https://', 1)
                try:
                    self.get_news_page(news_url)
                except KeyError:
                    print('The IP may have been banned')
                except IndexError:
                    print('A slider captcha may have appeared')
                    if self.ip_count < 100:
                        print('Rotating IP......')
                        self.ip = proxies.res_ip()
                        self.ip_count += 1
                        self.get_news_page(news_url)
                    else:
                        print('IP usage quota exceeded')

                # time.sleep(2)
        elif content.xpath('.//div[@class="content_none"]/div/p//text()'):
            txt = content.xpath('.//div[@class="content_none"]/div/p//text()')
            print(txt)
        else:
            print('Other error', url)
        print('------------------------------------------------')
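
The `gpc=stf%3D{}%2C{}%7Cstftype%3D1` fragment is Baidu advanced search's URL-encoded time-range filter, i.e. `stf=<start>,<end>|stftype=1` with epoch-second bounds. The string surgery above can be expressed directly; an equivalent sketch:

import time
from datetime import datetime, timedelta

now = time.time()
week_ago = (datetime.now() - timedelta(days=7)).timestamp()
# %3D is '=', %2C is ',', %7C is '|'
gpc = 'stf%3D{:.6f}%2C{:.6f}%7Cstftype%3D1'.format(week_ago, now)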
Example #7
    def run(self):

        for url in open('./../toutiao/new_url_file.json'):
            if self.ip_count < 150:
                url = url.strip()
                print('Crawler fetching URL {}'.format(url))
                logger.info('Crawler fetching URL {}'.format(url))
                try:
                    self.get_news_page(url, self.ip)
                except requests.exceptions.ProxyError:
                    print('Remote connection unresponsive, retrying once.......')
                    try:
                        if self.ip_count < 150:
                            print('Rotating IP......')
                            self.ip = proxies.res_ip()
                            self.ip_count += 1
                        self.get_news_page(url, self.ip)
                    except requests.exceptions.ProxyError:
                        print('Retry failed....remote connection unresponsive......')
                except Exception as e:
                    print('Another exception occurred: {}'.format(e))
                    print('Rotating IP......')
                    time.sleep(10)
                    self.ip = proxies.res_ip()
                    self.ip_count += 1
                    try:
                        self.get_news_page(url, self.ip)
                    except:
                        pass
                time.sleep(1)
                print('Finished crawling one URL.....')
            else:
                print('{} IPs already used'.format(str(self.ip_count)) +
                      ', stopping the crawler......')
                break
        logger.info('Crawl finished......')
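
The rotate-on-error pattern here (check `ip_count` against a cap, fetch a new IP, bump the counter) recurs throughout these spiders; a small helper could centralize it. A sketch, where the method name and the cap default are illustrative:

    def rotate_ip(self, limit=150):
        # Rotate to a fresh proxy unless the usage cap has been reached.
        if self.ip_count >= limit:
            raise RuntimeError('proxy budget exhausted: {} IPs used'.format(self.ip_count))
        self.ip = proxies.res_ip()
        self.ip_count += 1
        return self.ip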
Example #8
    def __init__(self, file_path, comment_path):
        self.headers_two = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            # 'Connection':'keep-alive',
            'Cookie':
            'cn_1255169715_dplus=%7B%22distinct_id%22%3A%20%2216730471952668-0ecf0ba7ae41cb-414f0120-15f900-16730471953461%22%2C%22sp%22%3A%20%7B%22%24_sessionid%22%3A%200%2C%22%24_sessionTime%22%3A%201542776168%2C%22%24dp%22%3A%200%2C%22%24_sessionPVTime%22%3A%201542776168%7D%7D; UM_distinctid=16730471952668-0ecf0ba7ae41cb-414f0120-15f900-16730471953461; JSESSIONID=208cee9fea61049d61e7d18f9e9c275ecf530a9e308a94dde36658adc01a0594; wuid=154945905891357; wuid_createAt=2018-11-21 12:56:9',
            'Host': 'www.baidu.com',
            'Referer': 'http://www.yidianzixun.com/channel/c11',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest'
        }
        self.proxies = ['218.95.55.154:4243']

        # Deduplication list
        self.set_list = []
        # URLs that failed to parse
        self.error_url_list = []
        self.headers_one = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding':
            'gzip, deflate, br',
            'Accept-Language':
            'zh-CN,zh;q=0.9',
            'Host':
            'www.baidu.com',
            # 'Proxy-Connection': 'keep-alive',
            'Upgrade-Insecure-Requests':
            '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
        }
        self.user_agent = [
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.1',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/536.6',
            'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/536.6',
            'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.1',
        ]

        a = str(datetime.now())
        hour = a.split(' ')[-1].split(':')[0]
        num = int(hour) / 3
        num = int(num) * 3  # round the hour down to the nearest 3-hour slot
        if num == 0:  # the midnight (0 o'clock) slot
            # Window selection
            date = datetime.now() - timedelta(days=1)
            news_start_time = str(date).split(' ')[0]
            yesterday = datetime.now() - timedelta(days=1)  # yesterday
            yesterday = str(yesterday).split(' ')[0]
        else:
            # Window selection
            date = datetime.now() - timedelta(days=0)
            news_start_time = str(date).split(' ')[0]
            yesterday = datetime.now() - timedelta(days=0)  # today
            yesterday = str(yesterday).split(' ')[0]
        # Start date, y-m-d (the earlier bound): news_start_time
        self.start_time = news_start_time
        # End date, y-m-d (the later bound): yesterday
        self.end_time = yesterday
        try:
            self.page_ip = proxies.res_ip()
            print('ip: ', self.page_ip)
            # self.page_ip = '116.248.160.138:4261'
        except Exception:
            time.sleep(3)
            print('Error while fetching a proxy IP: {}'.format(traceback.format_exc()))
            logger.error('Error while fetching a proxy IP: {}'.format(traceback.format_exc()))
            self.page_ip = proxies.res_ip()

        # Time range for crawling comments
        # self.comment_start_time = yesterday  # replies within one day
        self.comment_start_time = ''  # replies without a time limit
        self.comment_end_time = yesterday
        self.is_get_comment = True

        self.file_name_time = self.get_file_name_time()
        self.file_path = file_path
        self.comment_path = comment_path
        self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000',
                                     user='******')
        hour = str(datetime.now()).split(' ')[-1].split(':')[0]
        if str(hour) != '00':
            two_hour_ago = int(hour) - 2
            if len(str(two_hour_ago)) == 1:
                two_hour_ago = '0' + str(two_hour_ago)
            self.hour_name = str(two_hour_ago) + '_' + str(hour)
        else:
            self.hour_name = '22_24'
        self.hdfsclient.makedirs('{}/{}/{}'.format(
            self.file_path,
            self.file_name_time.split(' ')[0].replace('-', ''),
            self.hour_name))  # create the daily folder
        self.hdfsclient.makedirs('{}/{}/{}'.format(
            self.comment_path,
            self.file_name_time.split(' ')[0].replace('-', ''),
            self.hour_name))  # create the daily folder
        self.time_time = str(time.time()).split('.')[0]
Example #9
    def __init__(self):

        self.headers_one = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding':
            'gzip, deflate, br',
            'Accept-Language':
            'zh-CN,zh;q=0.9',
            'Host':
            'www.baidu.com',
            # 'Proxy-Connection': 'keep-alive',
            'Upgrade-Insecure-Requests':
            '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
        }

        self.headers_two = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding':
            'gzip, deflate, br',
            'Accept-Language':
            'zh-CN,zh;q=0.9',
            # 'Connection': 'keep-alive',
            'Host':
            'www.baidu.com',
            'Upgrade-Insecure-Requests':
            '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
        }
        self.headers_three = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding':
            'gzip, deflate, br',
            'Accept-Language':
            'zh-CN,zh;q=0.9',
            # 'Connection': 'keep-alive',
            'Host':
            'www.xiaohongshu.com',
            'Upgrade-Insecure-Requests':
            '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
        }

        self.start_url = ''
        # Comment API template
        self.commnet_port_url = ''
        # # Open the json files
        # self.news_jsonfile = open('./sina_newsfile.json', 'wb')
        # self.comment_jsonfile = open('./sina_commentfile.json', 'wb')
        date = datetime.now() - timedelta(days=3)
        news_start_time = str(date).split(' ')[0]
        yesterday = datetime.now() - timedelta(days=1)  # yesterday
        yesterday = str(yesterday).split(' ')[0]
        print('Crawl period: {} to {}'.format(news_start_time, yesterday))

        logging.info('Crawl period: {} to {}'.format(news_start_time, yesterday))

        # Start date, y-m-d (the earlier bound): news_start_time
        self.start_time = news_start_time
        # End date, y-m-d (the later bound): yesterday
        self.end_time = yesterday
        # Flag marking the crawler as running
        self.is_work = True
        # Proxy IPs
        self.proxies_list = [
            '121.231.226.210:4252',
        ]

        self.ip = proxies.res_ip()

        self.ip_count = 0

        # URL deduplication list
        self.set_list = []
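
The date window above is built by string-splitting `str(datetime.now())`; `strftime` states the same thing directly. An equivalent sketch:

from datetime import datetime, timedelta

start_time = (datetime.now() - timedelta(days=3)).strftime('%Y-%m-%d')  # earlier bound
end_time = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')    # later bound (yesterday)
print('Crawl period: {} to {}'.format(start_time, end_time))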
Example #10
    def __init__(self):

        self.headers_one = {
            'accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding':
            'gzip, deflate, br',
            'accept-language':
            'zh-CN,zh;q=0.9',
            'cache-control':
            'max-age=0',
            'upgrade-insecure-requests':
            '1',
            'user-agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
        }

        self.start_url = 'https://www.toutiao.com/api/pc/feed/'
        # Comment API template
        self.commnet_port_url = ''

        date = datetime.now() - timedelta(days=3)
        news_start_time = str(date).split(' ')[0]
        yesterday = datetime.now() - timedelta(days=1)  # yesterday
        yesterday = str(yesterday).split(' ')[0]
        print('Crawl period: {} to {}'.format(news_start_time, yesterday))

        logging.info('Crawl period: {} to {}'.format(news_start_time, yesterday))

        # Start date, y-m-d (the earlier bound)
        self.start_time = news_start_time
        # End date, y-m-d (the later bound)
        self.end_time = yesterday

        # Flag marking the crawler as running
        self.is_work = True
        # Comment page counter
        self.comment_page_num = 1
        # Deduplication list
        self.set_list = []
        # Proxy IPs
        self.proxies = [
            '112.245.235.249:4243',
            # '59.53.47.4:4249'
        ]
        # List collecting Q&A-style pages
        self.questions_list = []

        # Read the URL list
        with open('./../toutiao/new_url_file.json', 'r') as f:
            self.url_list = f.readlines()

        # Fetch a proxy IP
        self.ip = proxies.res_ip()

        # IP usage counter
        self.ip_count = 0