def start_requests(self):
    # If a crawl is already running, do not start another one
    status = self.getStatus()
    if status == 'running':
        self.isRunningStop = True
        return
    self.saveStatus('running')

    # Check the network; retry every 20 s
    while not NetworkUtil.checkNetWork():
        TimerUtil.sleep(20)
        self.logDao.warn(u'network check failed')

    # Check the server; retry every 20 s
    while not NetworkUtil.checkService():
        TimerUtil.sleep(20)
        self.logDao.warn(u'service check failed')

    # Request the list page
    newUrl = 'http://tech.qq.com/l/scroll.htm'
    self.logDao.warn(u'fetching list: ' + newUrl)
    yield scrapy.Request(url=newUrl,
                         meta={
                             'request_type': 'demoName_page_list',
                             'url': newUrl
                         },
                         callback=self.parseArticleList,
                         dont_filter=True)

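These methods lean on a few helpers (NetworkUtil, TimerUtil, and the getStatus/saveStatus pair) whose implementations are not shown in this section. A minimal sketch of what they might look like, purely illustrative; the connectivity targets and the status-file location are assumptions, not the project's actual code:

import json
import os
import socket
import time


class NetworkUtil(object):
    @staticmethod
    def checkNetWork(host='www.baidu.com', port=80, timeout=5):
        # True if a TCP connection to a well-known public host succeeds.
        try:
            socket.create_connection((host, port), timeout=timeout).close()
            return True
        except (socket.error, socket.timeout):
            return False

    @staticmethod
    def checkService(host='127.0.0.1', port=8080, timeout=5):
        # Same check against the backend the spiders report to (address is assumed).
        try:
            socket.create_connection((host, port), timeout=timeout).close()
            return True
        except (socket.error, socket.timeout):
            return False


class TimerUtil(object):
    @staticmethod
    def sleep(seconds):
        time.sleep(seconds)


STATUS_FILE = 'spider_status.json'  # hypothetical location of the running flag


def getStatus():
    # Read the last saved status; treat a missing file as 'stopped'.
    if not os.path.exists(STATUS_FILE):
        return 'stopped'
    with open(STATUS_FILE) as f:
        return json.load(f).get('status', 'stopped')


def saveStatus(status):
    with open(STATUS_FILE, 'w') as f:
        json.dump({'status': status, 'time': time.time()}, f)
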
def start_requests(self):
    # while True:
    # Check the network; retry every 20 s
    while not NetworkUtil.checkNetWork():
        TimerUtil.sleep(20)
        self.logDao.warn(u'network check failed')
        # continue

    # Check the server; retry every 20 s
    while not NetworkUtil.checkService():
        TimerUtil.sleep(20)
        self.logDao.warn(u'service check failed')
        # continue

    # Start crawling
    url = 'http://roll.news.sina.com.cn/interface/rollnews_ch_out_interface.php?col=96&spec=&type=&ch=01&k=&offset_page=0&offset_num=0&num=220&asc=&page=1'
    # url = 'http://tech.sina.com.cn/t/2017-07-24/doc-ifyihrit1274195.shtml'
    r = random.uniform(0, 1)
    newUrl = url + ('&r=' + str(r))
    self.logDao.info(u'start fetching list: ' + newUrl)
    yield scrapy.Request(url=newUrl,
                         meta={'request_type': 'sina_list', 'url': newUrl},
                         callback=self.parseList)

    # Second feed to pick up anything the first list missed
    url = 'http://feed.mix.sina.com.cn/api/roll/get?pageid=372&lid=2431&k=&num=50&page=1&callback=&_=1501148356254'
    r = random.uniform(0, 1)
    newUrl = url + ('&r=' + str(r))
    self.logDao.info(u'start fetching list: ' + newUrl)
    yield scrapy.Request(url=newUrl,
                         meta={'request_type': 'sina_list', 'url': newUrl},
                         callback=self.parseList2)

def start_requests(self):
    # If a crawl is already running, do not start another one
    status = self.getStatus()
    if status == 'running':
        self.isRunningStop = True
        return
    self.saveStatus('running')

    # Check the network; retry every 20 s
    while not NetworkUtil.checkNetWork():
        TimerUtil.sleep(20)
        self.logDao.warn(u'network check failed')

    # Check the server; retry every 20 s
    while not NetworkUtil.checkService():
        TimerUtil.sleep(20)
        self.logDao.warn(u'service check failed')

    src_channel = u'第一财经'
    sub_channel = u'新闻'

    # Request the list page
    newUrl = 'http://www.yicai.com/news/'
    self.logDao.warn(u'fetching list: ' + newUrl)
    yield scrapy.Request(url=newUrl,
                         meta={
                             'request_type': 'diyicaijing_page_list',
                             'url': newUrl,
                             'src_channel': src_channel,
                             'sub_channel': sub_channel
                         },
                         callback=self.parseArticleList,
                         dont_filter=True)

def start_requests(self):
    # TODO: wrapping the checks in a while loop may be a problem; some items may never get fetched
    # Check the network; retry every 20 s
    while not NetworkUtil.checkNetWork():
        TimerUtil.sleep(20)
        self.logDao.warn(u'network check failed')

    # Check the server; retry every 20 s
    while not NetworkUtil.checkService():
        TimerUtil.sleep(20)
        self.logDao.warn(u'service check failed')

    # Start crawling.
    # Fetch sources that are enabled and either failed their last update
    # or were last updated more than 40 minutes ago.
    sources = self.wxSourceDao.queryEnable_special(isRandom=True, wx_accounts=['CINNO_CreateMore'])
    for source in sources:
        # Mark the current source as "updating"; it is set to "failed" if the update
        # fails or gets blocked, and to "success" once it completes.
        (wx_name, wx_account, wx_url, wx_avatar, update_status, is_enable, update_time) = source
        self.wxSourceDao.updateStatus(wx_account, 'updating')

        # Request the Sogou Weixin search page for this account
        url = 'http://weixin.sogou.com/weixin?type=1&s_from=input&ie=utf8&_sug_=n&_sug_type_=&query='
        newUrl = url + wx_account
        self.logDao.warn(u'fetching: ' + newUrl)
        yield scrapy.Request(url=newUrl,
                             meta={'request_type': 'weixin_source', 'url': newUrl,
                                   'wx_account': wx_account, 'source': source},
                             callback=self.parseList,
                             dont_filter=True)

def start_requests(self):
    # while True:
    # Check the network; retry every 20 s
    while not NetworkUtil.checkNetWork():
        TimerUtil.sleep(20)
        self.logDao.warn(u'network check failed')
        # continue

    # Check the server; retry every 20 s
    while not NetworkUtil.checkService():
        TimerUtil.sleep(20)
        self.logDao.warn(u'service check failed')
        # continue

    for page in range(1, 2):
        r = random.uniform(0, 1)
        url = 'http://feed.mix.sina.com.cn/api/roll/get?pageid=372&lid=2431&k=&num=50&callback=&_=1501148356254&page='
        newUrl = url + str(page) + '&r=' + str(r)
        self.logDao.info(u'start fetching list: ' + newUrl)
        yield scrapy.Request(url=newUrl,
                             meta={
                                 'request_type': 'sina_list',
                                 'url': newUrl
                             },
                             callback=self.parseList2)

def start_requests(self):
    # Check the network; retry every 20 s
    while not NetworkUtil.checkNetWork():
        TimerUtil.sleep(20)
        self.logDao.warn(u'network check failed')

    # Check the server; retry every 20 s
    while not NetworkUtil.checkService():
        TimerUtil.sleep(20)
        self.logDao.warn(u'service check failed')

    # Channel ids: 必读 玩物 产品榜 快报 游戏要闻 单品 盘点 花边要闻 游戏快报
    cids = ['6', '66', '73', '84', '100', '119', '120', '121', '122']
    url = 'https://a.jiemian.com/index.php?m=lists&a=ajaxlist&callback=&_=1502103362598&page='
    for cid in cids:
        for page in range(1, 2):
            newUrl = url + str(page) + ('&cid=' + str(cid))
            self.logDao.warn(u'fetching list: ' + newUrl)
            yield scrapy.Request(url=newUrl,
                                 meta={
                                     'request_type': 'jiemian_page_list',
                                     'url': newUrl
                                 },
                                 callback=self.parseArticleList,
                                 dont_filter=True)

def start_requests(self):
    # Do not crawl between midnight and 6 am
    hour = datetime.datetime.now().hour
    if 0 <= hour <= 6:
        self.logDao.info(u'not crawling between 0:00 and 6:00')
        return

    # If a crawl is already running, do not start another one
    status = self.getStatus()
    if status == 'running':
        return
    self.saveStatus('running')

    # Check the network; retry every 20 s
    while not NetworkUtil.checkNetWork():
        TimerUtil.sleep(20)
        self.logDao.warn(u'network check failed')

    # Check the server; retry every 20 s
    while not NetworkUtil.checkService():
        TimerUtil.sleep(20)
        self.logDao.warn(u'service check failed')

    # Fetch the usable sources (those with a wx_url)
    sources = self.wxSourceDao.queryWxUrl(isRandom=True)

    # Put accounts whose last crawl broke at the front of the queue
    update_time, brokenAccounts = self.getBrokenAccounts()
    firstGroup = []
    secondGroup = []
    for source in sources:
        (id, wx_name, wx_account, wx_url, wx_avatar, update_status, is_enable, update_time) = source
        if wx_account in brokenAccounts:
            firstGroup.append(source)
        else:
            secondGroup.append(source)
    sources = firstGroup + secondGroup
    self.wxSources = sources

    for source in sources:
        (id, wx_name, wx_account, wx_url, wx_avatar, update_status, is_enable, update_time) = source
        # Request the account's article list page
        newUrl = wx_url
        self.logDao.warn(u'fetching: ' + newUrl)
        yield scrapy.Request(url=newUrl,
                             meta={
                                 'request_type': 'weixin_page_list',
                                 'source_url': newUrl,
                                 'wx_account': wx_account,
                                 'source': source,
                                 'wx_account_id': id
                             },
                             callback=self.parseArticleList,
                             dont_filter=True)

def start_requests(self):
    # while True:
    # Check the network once; wait 20 s if it is down
    if not NetworkUtil.checkNetWork():
        TimerUtil.sleep(20)
        self.logDao.warn(u'network check failed')
        # continue

    # Check the server once; wait 20 s if it is down
    if not NetworkUtil.checkService():
        TimerUtil.sleep(20)
        self.logDao.warn(u'service check failed')
        # continue

    if self.request_stop:
        # Redialing takes an unpredictable amount of time, so wait a while before retrying
        timeSpace = time.time() - self.request_stop_time
        if timeSpace / 60 <= 2:
            # Less than 2 minutes have passed; do not issue requests yet
            # continue
            pass
        else:
            self.request_stop = False

    # Start crawling
    url = 'http://roll.news.sina.com.cn/interface/rollnews_ch_out_interface.php?col=30&spec=&type=&ch=05&k' \
          '=&offset_page=0&offset_num=0&num=60&asc=&page='
    for page in range(0, 11):
        if self.request_stop:
            self.logDao.warn(u'blocked or network error detected, leaving the loop')
            # Once the crawler is blocked, stop all requests and wait for an IP change
            break
        r = random.uniform(0, 1)
        newUrl = url + str(page)
        newUrl += ('&r=' + str(r))
        self.logDao.info(u'start fetching list: ' + newUrl)
        yield scrapy.Request(url=newUrl, meta={'url': newUrl}, callback=self.parseList)
        # Idle for 2 seconds between pages
        TimerUtil.sleep(2)

    if self.request_stop:
        # Signal that a redial is needed
        self.logDao.warn(u'sending redial signal; will retry in about 2 minutes')
        self.request_stop_time = time.time()
        pass
    else:
        # A full round finished; idle for 10 minutes so pending requests can complete
        self.logDao.info(u'finished one round, but some requests may still be pending; sleeping 10 minutes')
        TimerUtil.sleep(10 * 60)
        pass

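Nothing in the method above ever sets request_stop back to True; presumably a ban-detection path elsewhere in the spider raises it. A hypothetical sketch of such a path, with the helper name markBlocked and the "empty body or captcha page" heuristic being assumptions rather than the project's actual detection logic:

import time


class BlockAwareSpiderMixin(object):
    # Illustrative mixin: call markBlocked(response) at the top of a parse callback;
    # it flips request_stop / request_stop_time so start_requests backs off and waits
    # for a redial.

    def markBlocked(self, response):
        body = response.text if hasattr(response, 'text') else ''
        if not body.strip() or u'验证码' in body:  # u'验证码' means "captcha"
            self.request_stop = True
            self.request_stop_time = time.time()
            self.logDao.warn(u'list response looks blocked, raising request_stop')
            return True
        return False
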
def start_requests(self):
    # Check the network; retry every 20 s
    while not NetworkUtil.checkNetWork():
        TimerUtil.sleep(20)
        self.logDao.warn(u'network check failed')

    # Check the server; retry every 20 s
    while not NetworkUtil.checkService():
        TimerUtil.sleep(20)
        self.logDao.warn(u'service check failed')

    src_channel = u'网易科技'
    sub_channel = u'科技'
    # Request the list page
    newUrl = 'http://tech.163.com/special/00094IHV/news_json.js?' + str(random.uniform(0, 1))
    self.logDao.warn(u'fetching list: ' + newUrl)
    yield scrapy.Request(url=newUrl,
                         meta={
                             'request_type': 'wangyi_page_list',
                             'url': newUrl,
                             'src_channel': src_channel,
                             'sub_channel': sub_channel
                         },
                         callback=self.parseArticleList,
                         dont_filter=True)

    src_channel = u'网易财经'
    sub_channel = u'财经'
    # Request the list page
    newUrl = 'http://money.163.com/special/00251G8F/news_json.js?' + str(random.uniform(0, 1))
    self.logDao.warn(u'fetching list: ' + newUrl)
    yield scrapy.Request(url=newUrl,
                         meta={
                             'request_type': 'wangyi_page_list',
                             'url': newUrl,
                             'src_channel': src_channel,
                             'sub_channel': sub_channel
                         },
                         callback=self.parseArticleList,
                         dont_filter=True)

def start_requests(self):
    # Check the network; retry every 20 s
    while not NetworkUtil.checkNetWork():
        TimerUtil.sleep(20)
        self.logDao.warn(u'network check failed')

    # Check the server; retry every 20 s
    while not NetworkUtil.checkService():
        TimerUtil.sleep(20)
        self.logDao.warn(u'service check failed')

    # Request the list page
    src_channel = u'腾讯科技'
    sub_channel = u'科技'
    newUrl = 'http://tech.qq.com/l/scroll.htm'
    self.logDao.warn(u'fetching list: ' + newUrl)
    yield scrapy.Request(url=newUrl,
                         meta={
                             'request_type': 'tengxun_page_list',
                             'url': newUrl,
                             'src_channel': src_channel,
                             'sub_channel': sub_channel
                         },
                         callback=self.parseArticleList,
                         dont_filter=True)

    # Request the list page
    src_channel = u'腾讯财经'
    sub_channel = u'财经要闻'
    newUrl = 'http://finance.qq.com/'
    self.logDao.warn(u'fetching list: ' + newUrl)
    yield scrapy.Request(url=newUrl,
                         meta={
                             'request_type': 'tengxun_page_list',
                             'url': newUrl,
                             'src_channel': src_channel,
                             'sub_channel': sub_channel
                         },
                         callback=self.parseArticleList2,
                         dont_filter=True)

def start_requests(self):
    # If a crawl is already running, do not start another one
    status = self.getStatus()
    if status == 'running':
        self.isRunningStop = True
        return
    self.saveStatus('running')

    # Check the network; retry every 20 s
    while not NetworkUtil.checkNetWork():
        TimerUtil.sleep(20)
        self.logDao.warn(u'network check failed')

    # Check the server; retry every 20 s
    while not NetworkUtil.checkService():
        TimerUtil.sleep(20)
        self.logDao.warn(u'service check failed')

    src_channel = u'搜狐科技'
    sub_channel = u'科技'
    for page in range(1, 4):
        # Request the list page
        url = 'http://v2.sohu.com/public-api/feed?scene=CHANNEL&sceneId=30&size=20&callback=&_=1502075449669&page='
        newUrl = url + str(page)
        self.logDao.warn(u'fetching list: ' + newUrl)
        yield scrapy.Request(url=newUrl,
                             meta={
                                 'request_type': 'sohu_page_list',
                                 'url': newUrl,
                                 'src_channel': src_channel,
                                 'sub_channel': sub_channel
                             },
                             callback=self.parseArticleList,
                             dont_filter=True)

def start_requests(self): # unKnow = ["didalive", "HIS_Technology", "ad_helper", "zhongduchongdu"]; 是搜索不到的 # TODO..加上while可能有问题,有些可能抓不到 # 如果正在爬,就不请求 # 检测网络 while not NetworkUtil.checkNetWork(): # 20s检测一次 TimerUtil.sleep(20) self.logDao.warn(u'检测网络不可行') # 检测服务器 while not NetworkUtil.checkService(): # 20s检测一次 TimerUtil.sleep(20) self.logDao.warn(u'检测服务器不可行') # 获取源 可用有值 sources = self.wxSourceDao.queryWxUrl_special(isRandom=True, wx_accounts=['qqtech']) for source in sources: (id, wx_name, wx_account, wx_url, wx_avatar, update_status, is_enable, update_time) = source # 进行页面访问 newUrl = wx_url self.logDao.warn(u'进行抓取:' + newUrl) yield scrapy.Request(url=newUrl, meta={ 'request_type': 'weixin_page_list', 'source_url': newUrl, 'wx_account': wx_account, 'source': source, 'wx_account_id': id }, callback=self.parseArticleList, dont_filter=True)
def start_requests(self):
    # If a crawl is already running, do not start another one
    status = self.getStatus()
    if status == 'running':
        self.isRunningStop = True
        return
    self.saveStatus('running')

    # Check the network; retry every 20 s
    while not NetworkUtil.checkNetWork():
        TimerUtil.sleep(20)
        self.logDao.warn(u'network check failed')

    # Check the server; retry every 20 s
    while not NetworkUtil.checkService():
        TimerUtil.sleep(20)
        self.logDao.warn(u'service check failed')

    # Channels: 必读 玩物 产品榜 快报 游戏要闻 单品 盘点 花边要闻 游戏快报
    cids = [
        {'src_channel': u'界面科技', 'sub_channel': u'必读', 'num': '6'},
        {'src_channel': u'界面科技', 'sub_channel': u'玩物', 'num': '66'},
        {'src_channel': u'界面科技', 'sub_channel': u'产品榜', 'num': '73'},
        {'src_channel': u'界面科技', 'sub_channel': u'快报', 'num': '84'},
        {'src_channel': u'界面游戏', 'sub_channel': u'游戏要闻', 'num': '100'},
        {'src_channel': u'界面游戏', 'sub_channel': u'单品', 'num': '119'},
        {'src_channel': u'界面游戏', 'sub_channel': u'盘点', 'num': '120'},
        {'src_channel': u'界面游戏', 'sub_channel': u'花边要闻', 'num': '121'},
        {'src_channel': u'界面游戏', 'sub_channel': u'游戏快报', 'num': '122'},
    ]

    url = 'https://a.jiemian.com/index.php?m=lists&a=ajaxlist&callback=&_=1502103362598&page='
    for cid in cids:
        for page in range(1, 2):
            cidNum = cid.get('num')
            src_channel = cid.get('src_channel')
            sub_channel = cid.get('sub_channel')
            newUrl = url + str(page) + ('&cid=' + cidNum)
            self.logDao.warn(u'fetching list: ' + newUrl)
            yield scrapy.Request(url=newUrl,
                                 meta={
                                     'request_type': 'jiemian_page_list',
                                     'url': newUrl,
                                     'src_channel': src_channel,
                                     'sub_channel': sub_channel
                                 },
                                 callback=self.parseArticleList,
                                 dont_filter=True)

def start_requests(self):
    # while True:
    # Check the network; retry every 20 s
    while not NetworkUtil.checkNetWork():
        TimerUtil.sleep(20)
        self.logDao.warn(u'network check failed')
        # continue

    # Check the server; retry every 20 s
    while not NetworkUtil.checkService():
        TimerUtil.sleep(20)
        self.logDao.warn(u'service check failed')
        # continue

    src_channel = u'新浪科技'
    sub_channel = u'科技'
    # Start crawling
    url = 'http://roll.news.sina.com.cn/interface/rollnews_ch_out_interface.php?col=96&spec=&type=&ch=01&k=&offset_page=0&offset_num=0&num=220&asc=&page=1'
    r = random.uniform(0, 1)
    newUrl = url + ('&r=' + str(r))
    self.logDao.info(u'start fetching list: ' + newUrl)
    yield scrapy.Request(url=newUrl,
                         meta={
                             'request_type': 'sina_list',
                             'url': newUrl,
                             'src_channel': src_channel,
                             'sub_channel': sub_channel
                         },
                         callback=self.parseList)

    src_channel = u'新浪科技'
    sub_channel = u'科技'
    # Second feed to pick up anything the first list missed
    url = 'http://feed.mix.sina.com.cn/api/roll/get?pageid=372&lid=2431&k=&num=50&page=1&callback=&_=1501148356254'
    r = random.uniform(0, 1)
    newUrl = url + ('&r=' + str(r))
    self.logDao.info(u'start fetching list: ' + newUrl)
    yield scrapy.Request(url=newUrl,
                         meta={
                             'request_type': 'sina_list',
                             'url': newUrl,
                             'src_channel': src_channel,
                             'sub_channel': sub_channel
                         },
                         callback=self.parseList2)

    # 新浪财经 front page, 要闻 channel
    src_channel = u'新浪财经'
    sub_channel = u'要闻'
    url = 'http://finance.sina.com.cn/'
    newUrl = url
    self.logDao.info(u'start fetching list: ' + newUrl)
    yield scrapy.Request(url=newUrl,
                         meta={
                             'request_type': 'sina_list',
                             'url': newUrl,
                             'src_channel': src_channel,
                             'sub_channel': sub_channel
                         },
                         callback=self.parseList3)

def start_requests(self):
    # If a crawl is already running, do not start another one
    status = self.getStatus()
    if status == 'running':
        self.isRunningStop = True
        return
    self.saveStatus('running')

    # Check the network; retry every 20 s
    while not NetworkUtil.checkNetWork():
        TimerUtil.sleep(20)
        self.logDao.warn(u'network check failed')

    # Check the server; retry every 20 s
    while not NetworkUtil.checkService():
        TimerUtil.sleep(20)
        self.logDao.warn(u'service check failed')

    src_channel = u'凤凰财经'
    sub_channel = u'电子竞技'
    url = 'http://games.ifeng.com/listpage/17886/1/list.shtml'
    styleUrlDefault = [
        'http://p2.ifengimg.com/a/2016/0523/esports.css',
        'http://y1.ifengimg.com/package/t_20130820__15953/css/pl_detail_v8.css'
    ]
    newUrl = url
    self.logDao.warn(u'fetching list: ' + newUrl)
    yield scrapy.Request(url=newUrl,
                         meta={
                             'request_type': 'fenghuang_page_list',
                             'url': newUrl,
                             'src_channel': src_channel,
                             'sub_channel': sub_channel,
                             'styleUrlDefault': styleUrlDefault
                         },
                         callback=self.parseArticleList3,
                         dont_filter=True)

    sub_channel = u'产品资讯'
    url = 'http://games.ifeng.com/listpage/27456/1/list.shtml'
    styleUrlDefault = [
        'http://p0.ifengimg.com/fe/responsiveDetail/styles/pc_c38f5a0e.css'
    ]
    newUrl = url
    self.logDao.warn(u'fetching list: ' + newUrl)
    yield scrapy.Request(url=newUrl,
                         meta={
                             'request_type': 'fenghuang_page_list',
                             'url': newUrl,
                             'src_channel': src_channel,
                             'sub_channel': sub_channel,
                             'styleUrlDefault': styleUrlDefault
                         },
                         callback=self.parseArticleList,
                         dont_filter=True)

    sub_channel = u'热点资讯'
    url = 'http://games.ifeng.com/listpage/27455/1/list.shtml'
    styleUrlDefault = [
        'http://p0.ifengimg.com/fe/responsiveDetail/styles/pc_c38f5a0e.css'
    ]
    newUrl = url
    self.logDao.warn(u'fetching list: ' + newUrl)
    yield scrapy.Request(url=newUrl,
                         meta={
                             'request_type': 'fenghuang_page_list',
                             'url': newUrl,
                             'src_channel': src_channel,
                             'sub_channel': sub_channel,
                             'styleUrlDefault': styleUrlDefault
                         },
                         callback=self.parseArticleList,
                         dont_filter=True)

    src_channel = u'凤凰科技'
    sub_channel = u'资讯'
    url = 'http://tech.ifeng.com/listpage/800/0/1/rtlist.shtml'
    styleUrlDefault = [
        'http://p0.ifengimg.com/fe/responsiveDetail/styles/pc_c38f5a0e.css'
    ]
    newUrl = url
    self.logDao.warn(u'fetching list: ' + newUrl)
    yield scrapy.Request(url=newUrl,
                         meta={
                             'request_type': 'fenghuang_page_list',
                             'url': newUrl,
                             'src_channel': src_channel,
                             'sub_channel': sub_channel,
                             'styleUrlDefault': styleUrlDefault
                         },
                         callback=self.parseArticleList2,
                         dont_filter=True)