def _push_task(self):
    """Drain the task-output queue forever, pushing each task to the output topic.

    Every task taken from ``self._queues.TASK_OUTPUT`` is handed to
    ``self._push`` together with ``self._site.TASK_OUTPUT_TOPIC``. When the
    push reports success the task is passed to ``self.pushed``; otherwise
    an error is logged and the loop moves on to the next task.

    This method never returns.
    """
    TDDCLogging.info('--->Task Output Producer Was Ready.')
    while True:
        task = self._queues.TASK_OUTPUT.get()
        delivered = self._push(self._site.TASK_OUTPUT_TOPIC, task)
        if delivered:
            self.pushed(task)
            continue
        TDDCLogging.error('Push Task Failed.')
def _parse(self): if not self._json_dict.get('success'): TDDCLogging.warning('Crawled[{}:{}] Failed.'.format( self._task.platform, self._task.url)) return data = self._json_dict.get('data') if not data: TDDCLogging.warning('Crawled[{}:{}] Exception.'.format( self._task.platform, self._task.url)) return self._get_detail_extra_info(data)
def _consume_msg_exp(self, exp_type, info, exception=None): if 'JSON_ERR' in exp_type: TDDCLogging.error('*' * 5 + exp_type + '*' * 5 + '\nException: ' + info + '\n' + exception.message + '\n' + '*' * (10 + len(exp_type)) + '\n') elif 'TASK_ERR' in exp_type or 'EVENT_ERR' in exp_type: TDDCLogging.error('*' * 5 + exp_type + '*' * 5 + '\nException: ' + 'item={item}\n'.format(item=info) + 'item_type={item_type}\n'.format( item_type=type(info)) + '*' * (10 + len(exp_type)) + '\n')
def _parse(self): if not self._json_dict.get('success'): TDDCLogging.warning('Crawled[{}:{}] Failed.'.format(self._task.platform, self._task.url)) return data = self._json_dict.get('data') if not data: TDDCLogging.warning('Crawled[{}:{}] Exception.'.format(self._task.platform, self._task.url)) return if data.get('pageIndex') == 1: self._make_bid_list_tasks(data) self._make_detail_task(data.get('data'))
def _push(self, topic, task, times=0): if not task: return False msg = json.dumps(task.__dict__) if msg: try: if self.ready_to_push(task): self._task_output_producer.send(topic, msg) except Exception, e: TDDCLogging.warning('Push Task Field: ' + e.message) gevent.sleep(1) if times == 10: return False return self._push(topic, task, times + 1) else: return True
def _make_detail_task(self, data): referer_base_url = 'https://www.weidai.com.cn/bid/showBidDetail?hash={hash}' base_url = 'https://www.weidai.com.cn/bid/bidDetail?hash={hash}&bid=' for detail_info in data: path = detail_info.get('hash') if not path: TDDCLogging.warning('Path Is None.') return task = Task() task.url = base_url.format(hash=path) task.platform = self.platform task.feature = 'weidai.bid_detail' task.headers = {'Referer': referer_base_url.format(hash=path), 'X-Requested-With': 'XMLHttpRequest'} self._md5_mk.update(task.url) task.row_key = self._md5_mk.hexdigest() self.tasks.append(task)
def _fetch_task(self):
    """Poll the task-input consumer forever, feeding records to ``_record_proc``.

    Back-pressure is applied from the size of ``self._queues.TASK_INPUT``:

    * when the queue grows beyond ``self._site.LOCAL_TASK_QUEUE_SIZE`` the
      consumer commits, unsubscribes and the loop is marked paused;
    * while paused the loop only sleeps, and does not poll, until the
      queue has drained below half of that size, at which point the
      consumer re-subscribes to ``self._site.TASK_INPUT_TOPIC``.

    This method never returns.
    """
    TDDCLogging.info('--->Task Input Consumer Was Ready.')
    paused = False
    while True:
        queued = self._queues.TASK_INPUT.qsize()
        limit = self._site.LOCAL_TASK_QUEUE_SIZE
        if paused:
            if queued >= limit / 2:
                # Still above the low-water mark: stay idle rather than
                # polling a consumer that has been unsubscribed.
                gevent.sleep(1)
                continue
            self._task_input_consumer.subscribe(self._site.TASK_INPUT_TOPIC)
            paused = False
            TDDCLogging.info('Task Input Consumer Was Resumed.')
        elif queued > limit:
            self._task_input_consumer.commit()
            self._task_input_consumer.unsubscribe()
            paused = True
            TDDCLogging.info('Task Input Consumer Was Paused.')
            gevent.sleep(1)
            continue
        # Presumably (timeout_ms, max_records) as in kafka-python's
        # KafkaConsumer.poll -- confirm against the consumer in use.
        partition_records = self._task_input_consumer.poll(2000, 16)
        if not partition_records:
            gevent.sleep(1)
            continue
        for records in partition_records.values():
            for record in records:
                self._record_proc(record)