Example #1
    def start_requests(self):
        # If a crawl is already running, do not start another
        status = self.getStatus()
        if status == 'running':
            self.isRunningStop = True
            return
        self.saveStatus('running')
        # Wait until the network is reachable
        while not NetworkUtil.checkNetWork():
            # Retry every 20 seconds
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')

        # Wait until the backend service is reachable
        while not NetworkUtil.checkService():
            # Retry every 20 seconds
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')

        # Request the listing page
        newUrl = 'http://tech.qq.com/l/scroll.htm'
        self.logDao.warn(u'进行抓取列表:' + newUrl)
        yield scrapy.Request(url=newUrl,
                             meta={
                                 'request_type': 'demoName_page_list',
                                 'url': newUrl
                             },
                             callback=self.parseArticleList,
                             dont_filter=True)
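Every example in this collection gates its requests on two project-local helpers, NetworkUtil and TimerUtil, whose implementations are not shown. The sketch below is one plausible reading, not the original code: the probe URL and the SERVICE_URL health endpoint are hypothetical stand-ins, assuming checkNetWork/checkService simply issue a cheap HTTP request and report success.

    # Hypothetical sketch of the helpers these examples assume.
    import time
    import requests

    class NetworkUtil(object):
        # Hypothetical health-check endpoint; the real service URL is not shown.
        SERVICE_URL = 'http://example.com/health'

        @staticmethod
        def checkNetWork():
            # Probe a well-known page to see whether the network is up at all.
            try:
                requests.head('http://www.baidu.com', timeout=5)
                return True
            except requests.RequestException:
                return False

        @staticmethod
        def checkService():
            # Probe our own backend; any non-error status counts as reachable.
            try:
                return requests.head(NetworkUtil.SERVICE_URL, timeout=5).status_code < 400
            except requests.RequestException:
                return False

    class TimerUtil(object):
        @staticmethod
        def sleep(seconds):
            # Thin wrapper over time.sleep, matching the examples' call sites.
            time.sleep(seconds)

Note that sleeping inside start_requests blocks Scrapy's event loop; these spiders appear to accept that because nothing useful can proceed while the network or the backend is down anyway.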
Example #2
    def start_requests(self):
        # while True:
        # Wait until the network is reachable
        while not NetworkUtil.checkNetWork():
            # Retry every 20 seconds
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')
            # continue

        # Wait until the backend service is reachable
        while not NetworkUtil.checkService():
            # Retry every 20 seconds
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')
            # continue

        # Start crawling
        url = 'http://roll.news.sina.com.cn/interface/rollnews_ch_out_interface.php?col=96&spec=&type=&ch=01&k=&offset_page=0&offset_num=0&num=220&asc=&page=1'
        # url = 'http://tech.sina.com.cn/t/2017-07-24/doc-ifyihrit1274195.shtml'

        r = random.uniform(0, 1)
        newUrl = url + ('&r=' + str(r))
        self.logDao.info(u"开始抓取列表:" + newUrl)
        yield scrapy.Request(url=newUrl, meta={'request_type': 'sina_list', 'url': newUrl}, callback=self.parseList)

        # Backfill: catch entries the first list may have missed
        url = 'http://feed.mix.sina.com.cn/api/roll/get?pageid=372&lid=2431&k=&num=50&page=1&callback=&_=1501148356254'
        r = random.uniform(0, 1)
        newUrl = url + ('&r=' + str(r))
        self.logDao.info(u"开始抓取列表:" + newUrl)
        yield scrapy.Request(url=newUrl, meta={'request_type': 'sina_list', 'url': newUrl}, callback=self.parseList2)
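Both URLs get a random r parameter appended so that intermediate caches and the rollnews interface treat each request as fresh. Since the same pattern recurs in several examples, it could be factored into a helper; the function below is a hypothetical refactoring, not part of the original spider.

    import random

    def with_cache_buster(url):
        # Append a random query parameter so caches treat every request as unique.
        sep = '&' if '?' in url else '?'
        return url + sep + 'r=' + str(random.uniform(0, 1))

With such a helper, newUrl = with_cache_buster(url) would replace the manual string concatenation.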
Example #3
    def start_requests(self):
        # If a crawl is already running, do not start another
        status = self.getStatus()
        if status == 'running':
            self.isRunningStop = True
            return
        self.saveStatus('running')

        # Wait until the network is reachable
        while not NetworkUtil.checkNetWork():
            # Retry every 20 seconds
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')

        # Wait until the backend service is reachable
        while not NetworkUtil.checkService():
            # Retry every 20 seconds
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')

        src_channel = u'第一财经'
        sub_channel = u'新闻'
        # Request the listing page
        newUrl = 'http://www.yicai.com/news/'
        self.logDao.warn(u'进行抓取列表:' + newUrl)
        yield scrapy.Request(url=newUrl,
                             meta={
                                 'request_type': 'diyicaijing_page_list',
                                 'url': newUrl,
                                 'src_channel': src_channel,
                                 'sub_channel': sub_channel
                             },
                             callback=self.parseArticleList, dont_filter=True)
Example #4
    def start_requests(self):
        # TODO: wrapping this in a while loop may be problematic; some items may not get crawled

        # Wait until the network is reachable
        while not NetworkUtil.checkNetWork():
            # Retry every 20 seconds
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')

        # Wait until the backend service is reachable
        while not NetworkUtil.checkService():
            # Retry every 20 seconds
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')

        # Start crawling
        # Fetch sources that are enabled and that either failed their last update or were last updated more than 40 minutes ago
        sources = self.wxSourceDao.queryEnable_special(isRandom=True, wx_accounts=['CINNO_CreateMore'])

        for source in sources:
            # Mark this source as updating; set it to failed if the update fails or gets blocked, and to success once it completes
            (wx_name, wx_account, wx_url, wx_avatar, update_status, is_enable, update_time) = source
            # Set the status to updating
            self.wxSourceDao.updateStatus(wx_account, 'updating')
            # Request the Sogou WeChat search page
            url = 'http://weixin.sogou.com/weixin?type=1&s_from=input&ie=utf8&_sug_=n&_sug_type_=&query='
            newUrl = url + wx_account
            self.logDao.warn(u'进行抓取:' + newUrl)
            yield scrapy.Request(url=newUrl,
                                 meta={'request_type': 'weixin_source', 'url': newUrl,
                                       'wx_account': wx_account, 'source': source},
                                 callback=self.parseList, dont_filter=True)
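The account name is concatenated into the Sogou search URL as-is, which is fine for plain ASCII account ids but would break on reserved or non-ASCII characters. A safer variant, assuming the Python 2 standard library these snippets appear to target, would percent-encode it first:

    import urllib

    # Percent-encode the account id before appending it to the query string.
    newUrl = url + urllib.quote(wx_account)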
Example #5
    def start_requests(self):
        # while True:
        # Wait until the network is reachable
        while not NetworkUtil.checkNetWork():
            # Retry every 20 seconds
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')
            # continue

        # Wait until the backend service is reachable
        while not NetworkUtil.checkService():
            # Retry every 20 seconds
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')
            # continue

        for page in range(1, 2):
            r = random.uniform(0, 1)
            url = 'http://feed.mix.sina.com.cn/api/roll/get?pageid=372&lid=2431&k=&num=50&callback=&_=1501148356254&page='
            newUrl = url + str(page) + '&r=' + str(r)
            self.logDao.info(u"开始抓取列表:" + newUrl)
            yield scrapy.Request(url=newUrl,
                                 meta={
                                     'request_type': 'sina_list',
                                     'url': newUrl
                                 },
                                 callback=self.parseList2)
Example #6
    def start_requests(self):
        # Wait until the network is reachable
        while not NetworkUtil.checkNetWork():
            # Retry every 20 seconds
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')

        # Wait until the backend service is reachable
        while not NetworkUtil.checkService():
            # Retry every 20 seconds
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')
        # Channel ids: Must Read, Gadgets, Product Rankings, Express, Game News, Single Product, Roundup, Gossip, Game Express
        cids = ['6', '66', '73', '84', '100', '119', '120', '121', '122']
        # Must Read (the same list endpoint serves every cid)
        url = 'https://a.jiemian.com/index.php?m=lists&a=ajaxlist&callback=&_=1502103362598&page='
        for cid in cids:
            for page in range(1, 2):
                newUrl = url + str(page) + ('&cid=' + str(cid))
                self.logDao.warn(u'进行抓取列表:' + newUrl)
                yield scrapy.Request(url=newUrl,
                                     meta={
                                         'request_type': 'jiemian_page_list',
                                         'url': newUrl
                                     },
                                     callback=self.parseArticleList,
                                     dont_filter=True)
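The query string above hard-codes an empty callback and a stale _ timestamp left over from the endpoint's JSONP mode. Building the fixed parameters with urlencode makes that intent explicit; this is a hypothetical rewrite of the URL construction, again assuming the Python 2 standard library, not the original code.

    import urllib

    # Fixed parameters of the ajaxlist endpoint; page and cid vary per request.
    params = {'m': 'lists', 'a': 'ajaxlist', 'callback': '', '_': '1502103362598'}
    base = 'https://a.jiemian.com/index.php?' + urllib.urlencode(params)
    for cid in cids:
        for page in range(1, 2):
            newUrl = base + '&page=' + str(page) + '&cid=' + str(cid)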
Example #7
    def start_requests(self):
        # Do not crawl between midnight and 6 a.m.
        hour = datetime.datetime.now().hour
        if 0 <= hour <= 6:
            self.logDao.info(u'这个时间不爬。0-6点')
            return

        # If a crawl is already running, do not start another
        status = self.getStatus()
        if status == 'running':
            return
        self.saveStatus('running')

        # Wait until the network is reachable
        while not NetworkUtil.checkNetWork():
            # Retry every 20 seconds
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')

        # Wait until the backend service is reachable
        while not NetworkUtil.checkService():
            # Retry every 20 seconds
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')

        # Fetch enabled sources that have a URL
        sources = self.wxSourceDao.queryWxUrl(isRandom=True)

        # Put accounts that failed last run at the front of the queue
        update_time, brokenAccounts = self.getBrokenAccounts()
        firstGroup = []
        secondGroup = []
        for source in sources:
            (id, wx_name, wx_account, wx_url, wx_avatar, update_status,
             is_enable, update_time) = source
            if wx_account in brokenAccounts:
                firstGroup.append(source)
            else:
                secondGroup.append(source)
        sources = firstGroup + secondGroup

        self.wxSources = sources
        for source in sources:
            (id, wx_name, wx_account, wx_url, wx_avatar, update_status,
             is_enable, update_time) = source
            # Request the listing page
            newUrl = wx_url
            self.logDao.warn(u'进行抓取:' + newUrl)
            yield scrapy.Request(url=newUrl,
                                 meta={
                                     'request_type': 'weixin_page_list',
                                     'source_url': newUrl,
                                     'wx_account': wx_account,
                                     'source': source,
                                     'wx_account_id': id
                                 },
                                 callback=self.parseArticleList,
                                 dont_filter=True)
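The firstGroup/secondGroup split above is a stable partition: accounts that broke last run move to the front while everything else keeps its order. Because Python's sort is stable and False orders before True, the same effect can be written as a one-line sort; a hypothetical equivalent (wx_account is index 2 in each source tuple):

    sources = sorted(sources, key=lambda s: s[2] not in brokenAccounts)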
Example #8
    def start_requests(self):
        # while True:
        # Check the network once
        if not NetworkUtil.checkNetWork():
            # Wait 20 seconds; unlike the while-loop variants, this checks only once
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')
            # continue

        # Check the server once
        if not NetworkUtil.checkService():
            # Wait 20 seconds before carrying on
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')
            # continue

        if self.request_stop:
            # Redialing takes effect after an unpredictable delay, so wait before retrying
            timeSpace = time.time() - self.request_stop_time
            if timeSpace / 60 <= 2:
                # Do not issue requests while less than 2 minutes have elapsed
                # continue
                pass
            else:
                self.request_stop = False

        # Start crawling
        url = 'http://roll.news.sina.com.cn/interface/rollnews_ch_out_interface.php?col=30&spec=&type=&ch=05&k' \
              '=&offset_page=0&offset_num=0&num=60&asc=&page='

        for page in range(0, 11):
            if self.request_stop:
                self.logDao.warn(u'出现被绊或者出现网络异常,退出循环')
                # If we got blocked, stop all requests and wait for the IP to change
                break
            r = random.uniform(0, 1)
            newUrl = url + str(page)
            newUrl += ('&r=' + str(r))
            self.logDao.info(u"开始抓取列表:" + newUrl)
            yield scrapy.Request(url=newUrl,
                                 meta={'url': newUrl},
                                 callback=self.parseList)
            # Idle for 2 seconds between pages
            TimerUtil.sleep(2)

        if self.request_stop:
            # Signal that a redial is needed
            self.logDao.warn(u'发送重新拨号信号,请等待2分钟会尝试重新抓取')
            self.request_stop_time = time.time()
            pass
        else:
            # After a normal run, idle for 10 minutes so any unfinished requests can complete
            self.logDao.info(u'请求了一轮了,但是可能还有没有请求完成,睡一会10分钟')
            TimerUtil.sleep(10 * 60)
            pass
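This example coordinates with an external redial mechanism through two attributes: request_stop, presumably set elsewhere when a parse callback detects blocking, and request_stop_time, the moment the stop was signalled. The elapsed-time guard at the top could be isolated into a small spider method; the sketch below merely restates that logic under those assumptions (it relies on the same import time the snippet already uses).

    def can_resume(self):
        # Hypothetical helper: crawling may resume once at least 2 minutes
        # have passed since the redial signal; mirrors the guard above.
        if not self.request_stop:
            return True
        if (time.time() - self.request_stop_time) / 60 > 2:
            self.request_stop = False
            return True
        return False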
Example #9
    def start_requests(self):
        # Wait until the network is reachable
        while not NetworkUtil.checkNetWork():
            # Retry every 20 seconds
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')

        # Wait until the backend service is reachable
        while not NetworkUtil.checkService():
            # Retry every 20 seconds
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')

        src_channel = u'网易科技'
        sub_channel = u'科技'
        # Request the listing page
        newUrl = 'http://tech.163.com/special/00094IHV/news_json.js?' + str(
            random.uniform(0, 1))
        self.logDao.warn(u'进行抓取列表:' + newUrl)
        yield scrapy.Request(url=newUrl,
                             meta={
                                 'request_type': 'wangyi_page_list',
                                 'url': newUrl,
                                 'src_channel': src_channel,
                                 'sub_channel': sub_channel
                             },
                             callback=self.parseArticleList,
                             dont_filter=True)

        src_channel = u'网易财经'
        sub_channel = u'财经'
        # Request the listing page
        newUrl = 'http://money.163.com/special/00251G8F/news_json.js?' + str(
            random.uniform(0, 1))
        self.logDao.warn(u'进行抓取列表:' + newUrl)
        yield scrapy.Request(url=newUrl,
                             meta={
                                 'request_type': 'wangyi_page_list',
                                 'url': newUrl,
                                 'src_channel': src_channel,
                                 'sub_channel': sub_channel
                             },
                             callback=self.parseArticleList,
                             dont_filter=True)
Example #10
    def start_requests(self):
        # Wait until the network is reachable
        while not NetworkUtil.checkNetWork():
            # Retry every 20 seconds
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')

        # Wait until the backend service is reachable
        while not NetworkUtil.checkService():
            # Retry every 20 seconds
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')

        # Request the listing page
        src_channel = u'腾讯科技'
        sub_channel = u'科技'
        newUrl = 'http://tech.qq.com/l/scroll.htm'
        self.logDao.warn(u'进行抓取列表:' + newUrl)
        yield scrapy.Request(url=newUrl,
                             meta={
                                 'request_type': 'tengxun_page_list',
                                 'url': newUrl,
                                 'src_channel': src_channel,
                                 'sub_channel': sub_channel
                             },
                             callback=self.parseArticleList, dont_filter=True)

        # Request the listing page
        src_channel = u'腾讯财经'
        sub_channel = u'财经要闻'
        newUrl = 'http://finance.qq.com/'
        self.logDao.warn(u'进行抓取列表:' + newUrl)
        yield scrapy.Request(url=newUrl,
                             meta={
                                 'request_type': 'tengxun_page_list',
                                 'url': newUrl,
                                 'src_channel': src_channel,
                                 'sub_channel': sub_channel
                             },
                             callback=self.parseArticleList2, dont_filter=True)
Example #11
    def start_requests(self):
        # If a crawl is already running, do not start another
        status = self.getStatus()
        if status == 'running':
            self.isRunningStop = True
            return
        self.saveStatus('running')
        # Wait until the network is reachable
        while not NetworkUtil.checkNetWork():
            # Retry every 20 seconds
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')

        # Wait until the backend service is reachable
        while not NetworkUtil.checkService():
            # Retry every 20 seconds
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')

        src_channel = u'搜狐科技'
        sub_channel = u'科技'

        for page in range(1, 4):
            # Request the listing page
            url = 'http://v2.sohu.com/public-api/feed?scene=CHANNEL&sceneId=30&size=20&callback=&_=1502075449669&page='
            newUrl = url + str(page)
            self.logDao.warn(u'进行抓取列表:' + newUrl)
            yield scrapy.Request(url=newUrl,
                                 meta={
                                     'request_type': 'sohu_page_list',
                                     'url': newUrl,
                                     'src_channel': src_channel,
                                     'sub_channel': sub_channel
                                 },
                                 callback=self.parseArticleList,
                                 dont_filter=True)
Example #12
    def start_requests(self):
        # unKnow = ["didalive", "HIS_Technology", "ad_helper", "zhongduchongdu"]; these accounts cannot be found by search
        # TODO: wrapping this in a while loop may be problematic; some items may not get crawled
        # If a crawl is already running, do not start another
        # Wait until the network is reachable
        while not NetworkUtil.checkNetWork():
            # Retry every 20 seconds
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')

        # Wait until the backend service is reachable
        while not NetworkUtil.checkService():
            # Retry every 20 seconds
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')

        # Fetch enabled sources that have a URL
        sources = self.wxSourceDao.queryWxUrl_special(isRandom=True,
                                                      wx_accounts=['qqtech'])

        for source in sources:
            (id, wx_name, wx_account, wx_url, wx_avatar, update_status,
             is_enable, update_time) = source
            # Request the source page
            newUrl = wx_url
            self.logDao.warn(u'进行抓取:' + newUrl)
            yield scrapy.Request(url=newUrl,
                                 meta={
                                     'request_type': 'weixin_page_list',
                                     'source_url': newUrl,
                                     'wx_account': wx_account,
                                     'source': source,
                                     'wx_account_id': id
                                 },
                                 callback=self.parseArticleList,
                                 dont_filter=True)
Example #13
    def start_requests(self):
        # If a crawl is already running, do not start another
        status = self.getStatus()
        if status == 'running':
            self.isRunningStop = True
            return
        self.saveStatus('running')

        # Wait until the network is reachable
        while not NetworkUtil.checkNetWork():
            # Retry every 20 seconds
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')

        # Wait until the backend service is reachable
        while not NetworkUtil.checkService():
            # Retry every 20 seconds
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')
        # Channels: Must Read, Gadgets, Product Rankings, Express, Game News, Single Product, Roundup, Gossip, Game Express
        cids = [{
            'src_channel': u'界面科技',
            'sub_channel': u'必读',
            'num': '6'
        }, {
            'src_channel': u'界面科技',
            'sub_channel': u'玩物',
            'num': '66'
        }, {
            'src_channel': u'界面科技',
            'sub_channel': u'产品榜',
            'num': '73'
        }, {
            'src_channel': u'界面科技',
            'sub_channel': u'快报',
            'num': '84'
        }, {
            'src_channel': u'界面游戏',
            'sub_channel': u'游戏要闻',
            'num': '100'
        }, {
            'src_channel': u'界面游戏',
            'sub_channel': u'单品',
            'num': '119'
        }, {
            'src_channel': u'界面游戏',
            'sub_channel': u'盘点',
            'num': '120'
        }, {
            'src_channel': u'界面游戏',
            'sub_channel': u'花边要闻',
            'num': '121'
        }, {
            'src_channel': u'界面游戏',
            'sub_channel': u'游戏快报',
            'num': '122'
        }]
        # Must Read (the same list endpoint serves every cid)
        url = 'https://a.jiemian.com/index.php?m=lists&a=ajaxlist&callback=&_=1502103362598&page='
        for cid in cids:
            for page in range(1, 2):
                cidNum = cid.get('num')
                src_channel = cid.get('src_channel')
                sub_channel = cid.get('sub_channel')
                newUrl = url + str(page) + ('&cid=' + cidNum)
                self.logDao.warn(u'进行抓取列表:' + newUrl)
                yield scrapy.Request(url=newUrl,
                                     meta={
                                         'request_type': 'jiemian_page_list',
                                         'url': newUrl,
                                         'src_channel': src_channel,
                                         'sub_channel': sub_channel
                                     },
                                     callback=self.parseArticleList,
                                     dont_filter=True)
Example #14
    def start_requests(self):
        # while True:
        # Wait until the network is reachable
        while not NetworkUtil.checkNetWork():
            # Retry every 20 seconds
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')
            # continue

        # Wait until the backend service is reachable
        while not NetworkUtil.checkService():
            # Retry every 20 seconds
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')
            # continue

        src_channel = u'新浪科技'
        sub_channel = u'科技'
        # Start crawling
        url = 'http://roll.news.sina.com.cn/interface/rollnews_ch_out_interface.php?col=96&spec=&type=&ch=01&k=&offset_page=0&offset_num=0&num=220&asc=&page=1'

        r = random.uniform(0, 1)
        newUrl = url + ('&r=' + str(r))
        self.logDao.info(u"开始抓取列表:" + newUrl)
        yield scrapy.Request(url=newUrl,
                             meta={
                                 'request_type': 'sina_list',
                                 'url': newUrl,
                                 'src_channel': src_channel,
                                 'sub_channel': sub_channel
                             },
                             callback=self.parseList)

        src_channel = u'新浪科技'
        sub_channel = u'科技'
        # Backfill: catch entries the first list may have missed
        url = 'http://feed.mix.sina.com.cn/api/roll/get?pageid=372&lid=2431&k=&num=50&page=1&callback=&_=1501148356254'
        r = random.uniform(0, 1)
        newUrl = url + ('&r=' + str(r))
        self.logDao.info(u"开始抓取列表:" + newUrl)
        yield scrapy.Request(url=newUrl,
                             meta={
                                 'request_type': 'sina_list',
                                 'url': newUrl,
                                 'src_channel': src_channel,
                                 'sub_channel': sub_channel
                             },
                             callback=self.parseList2)

        # Sina Finance: top news
        src_channel = u'新浪财经'
        sub_channel = u'要闻'
        url = 'http://finance.sina.com.cn/'
        newUrl = url
        self.logDao.info(u"开始抓取列表:" + newUrl)
        yield scrapy.Request(url=newUrl,
                             meta={
                                 'request_type': 'sina_list',
                                 'url': newUrl,
                                 'src_channel': src_channel,
                                 'sub_channel': sub_channel
                             },
                             callback=self.parseList3)
Example #15
    def start_requests(self):
        # If a crawl is already running, do not start another
        status = self.getStatus()
        if status == 'running':
            self.isRunningStop = True
            return
        self.saveStatus('running')

        # Wait until the network is reachable
        while not NetworkUtil.checkNetWork():
            # Retry every 20 seconds
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测网络不可行')

        # Wait until the backend service is reachable
        while not NetworkUtil.checkService():
            # Retry every 20 seconds
            TimerUtil.sleep(20)
            self.logDao.warn(u'检测服务器不可行')
        src_channel = u'凤凰财经'

        sub_channel = u'电子竞技'
        url = 'http://games.ifeng.com/listpage/17886/1/list.shtml'
        styleUrlDefault = [
            'http://p2.ifengimg.com/a/2016/0523/esports.css',
            'http://y1.ifengimg.com/package/t_20130820__15953/css/pl_detail_v8.css'
        ]
        newUrl = url
        self.logDao.warn(u'进行抓取列表:' + newUrl)
        yield scrapy.Request(url=newUrl,
                             meta={
                                 'request_type': 'fenghuang_page_list',
                                 'url': newUrl,
                                 'src_channel': src_channel,
                                 'sub_channel': sub_channel,
                                 'styleUrlDefault': styleUrlDefault
                             },
                             callback=self.parseArticleList3,
                             dont_filter=True)

        sub_channel = u'产品资讯'
        url = 'http://games.ifeng.com/listpage/27456/1/list.shtml'
        styleUrlDefault = [
            'http://p0.ifengimg.com/fe/responsiveDetail/styles/pc_c38f5a0e.css'
        ]
        newUrl = url
        self.logDao.warn(u'进行抓取列表:' + newUrl)
        yield scrapy.Request(url=newUrl,
                             meta={
                                 'request_type': 'fenghuang_page_list',
                                 'url': newUrl,
                                 'src_channel': src_channel,
                                 'sub_channel': sub_channel,
                                 'styleUrlDefault': styleUrlDefault
                             },
                             callback=self.parseArticleList,
                             dont_filter=True)

        sub_channel = u'热点资讯'
        url = 'http://games.ifeng.com/listpage/27455/1/list.shtml'
        styleUrlDefault = [
            'http://p0.ifengimg.com/fe/responsiveDetail/styles/pc_c38f5a0e.css'
        ]
        newUrl = url
        self.logDao.warn(u'进行抓取列表:' + newUrl)
        yield scrapy.Request(url=newUrl,
                             meta={
                                 'request_type': 'fenghuang_page_list',
                                 'url': newUrl,
                                 'src_channel': src_channel,
                                 'sub_channel': sub_channel,
                                 'styleUrlDefault': styleUrlDefault
                             },
                             callback=self.parseArticleList,
                             dont_filter=True)

        src_channel = u'凤凰科技'
        sub_channel = u'资讯'
        url = 'http://tech.ifeng.com/listpage/800/0/1/rtlist.shtml'
        styleUrlDefault = [
            'http://p0.ifengimg.com/fe/responsiveDetail/styles/pc_c38f5a0e.css'
        ]
        newUrl = url
        self.logDao.warn(u'进行抓取列表:' + newUrl)
        yield scrapy.Request(url=newUrl,
                             meta={
                                 'request_type': 'fenghuang_page_list',
                                 'url': newUrl,
                                 'src_channel': src_channel,
                                 'sub_channel': sub_channel,
                                 'styleUrlDefault': styleUrlDefault
                             },
                             callback=self.parseArticleList2,
                             dont_filter=True)
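Throughout these examples, channel labels and style URLs ride along in the request's meta dict; Scrapy copies request.meta onto the response, so each callback can read them back. The callbacks themselves are not shown anywhere in this collection; a minimal, hypothetical opening for one of them might look like this:

    def parseArticleList(self, response):
        # Values attached in start_requests come back via response.meta.
        src_channel = response.meta['src_channel']
        sub_channel = response.meta['sub_channel']
        styleUrlDefault = response.meta.get('styleUrlDefault', [])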