def crawl(self):
    """Consume the redis item queue, crawl each item and batch its SQL rows.

    Repeatedly pulls a message with ``self.redisQueue.get_q(self._key)``.
    For a non-empty message, when ``self.jm_queue_type == 'main'``, an
    ``Item`` is populated through ``antPageGlobal`` from the message's
    ``"val"`` entry (with ``self.a_val`` appended when it is set). The item's
    ``outGlobalSql()`` result is pushed onto ``self.items`` and appended to a
    local buffer that is handed to ``self.insertGlobalItemHour``; the buffer
    is cleared whenever that call returns a truthy value.

    An empty poll hands the buffer to ``insertGlobalItemHour(buffer, True)``
    and clears it. After more than two empty polls the loop ends; otherwise
    the worker sleeps 10 seconds and polls again.

    Error handling per message:
      * ``Common.NoItemException`` / ``Common.NoPageException``: logged only.
      * ``Common.InvalidPageException``: message passed to ``crawlRetry``.
      * any other ``Exception``: logged with traceback and passed to
        ``crawlRetry``; unless the error text contains ``'Read timed out'``
        the router is redialled and the worker sleeps 10-30 seconds.

    Returns:
        None.
    """
    pending_sql = []
    # NOTE(review): empty_polls is never reset after a successful poll, so the
    # limit applies to the total number of empty polls, not consecutive ones.
    empty_polls, max_empty_polls = 0, 2
    fetched_count = 0
    while True:
        # Reset every iteration: the handlers below must never retry a message
        # left over from an earlier iteration (or touch an unbound name when
        # the very first get_q call fails).
        _data = None
        try:
            _data = self.redisQueue.get_q(self._key)

            if not _data:
                # Queue is empty: hand over whatever is buffered. The second
                # argument is presumably a "force flush" flag - confirm
                # against insertGlobalItemHour.
                self.insertGlobalItemHour(pending_sql, True)
                pending_sql = []

                empty_polls += 1
                if empty_polls > max_empty_polls:
                    Common.log('# all get itemQ item num: %d' % fetched_count)
                    Common.log('# not get itemQ of key: %s' % self._key)
                    break
                time.sleep(10)
                continue

            fetched_count += 1
            if self.jm_queue_type != 'main':
                # Only the 'main' queue type is crawled here.
                continue

            # Build the item from the queued value.
            item = Item()
            _val = _data["val"]
            if self.a_val:
                _val = _val + self.a_val
            item.antPageGlobal(_val)

            # Collect the result on the worker.
            self.push_back(self.items, item.outGlobalSql())

            # Buffer the row; the buffer is cleared once the insert helper
            # reports (truthy return) that it has taken the batch.
            pending_sql.append(item.outGlobalSql())
            if self.insertGlobalItemHour(pending_sql):
                pending_sql = []

            # Throttle between items.
            time.sleep(1)

        except Common.NoItemException as e:
            Common.log('# Not item exception: %s' % e)

        except Common.NoPageException as e:
            Common.log('# Not page exception: %s' % e)

        except Common.InvalidPageException as e:
            if _data:
                self.crawlRetry(self._key, _data)
            Common.log('# Invalid page exception: %s' % e)

        except Exception as e:
            Common.log('# Unknown exception crawl item: %s' % e)
            Common.traceback_log()

            # Retry only a message that was actually fetched this iteration.
            if _data:
                self.crawlRetry(self._key, _data)
            if 'Read timed out' not in str(e):
                # Anything other than a read timeout: redial the router.
                try:
                    self.dialRouter(4, 'item')
                except Exception as dial_err:
                    Common.log('# DailClient Exception err: %s' % dial_err)
                    time.sleep(10)
                time.sleep(random.uniform(10, 30))
def crawl(self):
    """Consume the worker queue, crawl each item and batch its SQL rows.

    Repeatedly fetches a message with ``self.get_q()``. When that raises
    ``Empty`` the buffered rows are handed to
    ``self.insertIteminfo(buffer, True)`` and the loop ends.

    For a fetched message, when ``self._q_type == 'main'``, an ``Item`` is
    populated through ``antPage`` from ``_data[1]`` (with ``self.a_val``
    appended when it is set). The item's ``outSql()`` result is pushed onto
    ``self.items`` and appended to a local buffer that is handed to
    ``self.insertIteminfo``; the buffer is cleared whenever that call returns
    a truthy value. The worker then sleeps one second.

    Every fetched message is acknowledged with ``self.queue.task_done()``
    exactly once, whether it succeeded or failed. No acknowledgement (and no
    retry) is issued when nothing was fetched.

    Error handling per message:
      * ``Common.NoItemException`` / ``Common.NoPageException``: logged only.
      * ``Common.InvalidPageException``: message passed to ``crawlRetry``.
      * any other ``Exception``: logged with traceback and passed to
        ``crawlRetry``; the message is logged for name-resolution errors;
        unless the error text contains ``'Read timed out'`` the router is
        redialled and the worker sleeps 10-40 seconds.

    Returns:
        None.
    """
    pending_sql = []

    while True:
        _data = None
        # True from the moment a message has been fetched until it has been
        # acknowledged; guards against task_done() without a matching get.
        needs_ack = False
        try:
            try:
                # Fetch the next queue message.
                _data = self.get_q()
            except Empty:
                # Queue is empty: hand over whatever is buffered and stop.
                # The second argument is presumably a "force flush" flag -
                # confirm against insertIteminfo.
                self.insertIteminfo(pending_sql, True)
                pending_sql = []
                break
            needs_ack = True

            if self._q_type == 'main':
                # Build the item from the queued value.
                item = Item()
                _val = _data[1]
                if self.a_val:
                    _val = _val + self.a_val
                item.antPage(_val)

                # Collect the result on the worker.
                self.push_back(self.items, item.outSql())

                # Buffer the row; the buffer is cleared once the insert
                # helper reports (truthy return) that it has taken the batch.
                pending_sql.append(item.outSql())
                if self.insertIteminfo(pending_sql):
                    pending_sql = []

            # Throttle between items.
            time.sleep(1)

        except Common.NoItemException as e:
            Common.log('# Not item exception: %s' % e)

        except Common.NoPageException as e:
            Common.log('# Not page exception: %s' % e)

        except Common.InvalidPageException as e:
            if needs_ack:
                self.crawlRetry(_data)
            Common.log('# Invalid page exception: %s' % e)

        except Exception as e:
            Common.log('# Unknown exception crawl item: %s' % e)
            Common.traceback_log()

            if needs_ack:
                self.crawlRetry(_data)
                # Acknowledge before the potentially long redial/back-off
                # below, as the original ordering did.
                self.queue.task_done()
                needs_ack = False

            err_text = str(e)
            if ('Name or service not known' in err_text
                    or 'Temporary failure in name resolution' in err_text):
                Common.log(_data)
            if 'Read timed out' not in err_text:
                # Anything other than a read timeout: redial the router.
                try:
                    self.dialRouter(4, 'item')
                except Exception as dial_err:
                    Common.log('# DailClient Exception err: %s' % dial_err)
                    time.sleep(10)
                time.sleep(random.uniform(10, 40))

        finally:
            # Tell the queue this task is finished - also when a handler
            # above raised - but only if a message was really fetched.
            if needs_ack:
                self.queue.task_done()