def send_api_request(self, init_params):
    """Send request to api.

    :param dict init_params: Request params
    :return: http request response
    :rtype: str
    :raises ApiError: raise :class:`ApiError` if an error occurs during the connection.
    """
    # gevent based api request
    if self._gevent_async:
        timeout = Timeout(self._timeout)
        timeout.start()
        try:
            #self.logger.debug('START')
            res = self._send_api_request(init_params)
            #self.logger.debug('STOP')
        except Timeout:
            err = 'Cloudstack api call timeout after : %ss' % self._timeout
            self.logger.error(err)
            raise ApiError(err)
        except ApiError:
            raise
        finally:
            timeout.cancel()
    # blocking api request
    else:
        res = self._send_api_request(init_params, timeout=self._timeout)
    return res
def _get_message_data(self):
    max_size = self.extensions.getparam('SIZE', filter=int)
    reader = DataReader(self.io, max_size)

    err = None
    timeout = Timeout(self.data_timeout)
    timeout.start()
    try:
        data = reader.recv()
    except ConnectionLost:
        raise
    except SmtpError as e:
        data = None
        err = e
    finally:
        timeout.cancel()

    reply = Reply('250', '2.6.0 Message Accepted for Delivery')
    self._call_custom_handler('HAVE_DATA', reply, data, err)

    self.io.send_reply(reply)
    self.io.flush_send()

    self.have_mailfrom = None
    self.have_rcptto = None
def __init__(self, url):
    self.url = url
    self.protocol, self.domain = self.url.split("://")  # e.g. news.bbc.co.uk
    self.domain = self.domain.split('/')[0]
    self.site_data = sites[self.domain]
    self.total_words = {}

    timeout = Timeout(30, TimeoutError)
    timeout.start()
    try:
        self.html = self.read_url()
    except TimeoutError:
        print url + " timed out"
        return
    finally:
        timeout.cancel()

    self.text = self.boiler_extract()
    self.soup = BeautifulSoup(self.html, 'lxml')
    self.article = self.is_article()
    if self.article:
        self.calc_total_words()
        articles.put(self)
        self.find_links()
def create(self, usernames, passwords, target_iqn, target_lun, size, initiator_iqn_list):
    # NB: initiator_iqn_list needs to be a comma separated list of initiator iqn strings
    self.logger.debug("Preparing to execute create()")
    timeout = Timeout(self.script_timeout)
    process = Popen(self.scriptfile_path + " -c -q" +
                    " -u " + usernames +
                    " -p " + passwords +
                    " -s " + size +
                    " -m " + target_lun +
                    " -t " + target_iqn +
                    " -i " + initiator_iqn_list,
                    stdout=PIPE, shell=True)
    output = "Create operation exceeded execution timeout.\n"
    returncode = 1
    timeout.start()
    try:
        output = process.communicate()[0]
        returncode = process.returncode
    except Timeout:
        process.kill()
        self.logger.warn(
            "Process %s servicing create() " +
            "exceeded execution timeout and was terminated.", process.pid)
        if process.returncode is not None:
            returncode = process.returncode
    finally:
        timeout.cancel()
    return [output, returncode]
def main(socket, address):
    global client_mgr
    print "one client", address
    logger.debug("one client %s" % str(address))
    client = Client(socket)
    hbTimer = None
    while True:
        try:
            hbTimer = Timeout(ONE_MOVE_MAX_TIME)
            hbTimer.start()
            client.read_and_deal_cmd()
            hbTimer.cancel()
        except Timeout, t:
            if t == hbTimer:
                print "client lose"
                client.lose_hb()
                client.cancel_timeout()
                if client.latitude != None:
                    client_mgr.remove_client(client)
                client = None
                break
            else:
                print "other timeout"
                hbTimer.cancel()
                client.deal_timeout()
        except:
def extractor(filename):
    interpreter = POPEN(binary + [filename] + append,
                        stdin=subprocess.PIPE,
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE)
    timeout = Timeout(getattr(settings, 'EXTRACTION_TIMEOUT', 120), ChildTimeout)
    timeout.start()
    try:
        output, run_error = interpreter.communicate('')
        timeout.cancel()
    except ChildTimeout:
        print 'killing %s' % filename
        interpreter.kill()
        raise
    if (output_type == 'text' and not output.strip()) or (
            output_type == 'html' and html_is_empty(output)) or (
            error and (error in output or error in run_error)):
        raise ExtractionFailed()
    elif output_type == 'html':
        # strip non-breaking spaces
        return _nbsp.sub(' ', output)
    else:
        return output
def _recv_command(self):
    timeout = Timeout(self.command_timeout)
    timeout.start()
    try:
        return self.io.recv_command()
    finally:
        timeout.cancel()
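
# For reference, gevent also ships a helper that wraps a single call in a
# timeout, which covers the start/try/finally/cancel pattern above in one
# line. A minimal sketch follows; the function name and parameters here are
# illustrative assumptions, not names from the original code.
import gevent

def recv_command_with_timeout(io, command_timeout):
    # raises gevent.Timeout if io.recv_command() does not return in time
    return gevent.with_timeout(command_timeout, io.recv_command)
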
def func1():
    utc = arrow.utcnow()
    local = utc.to('Asia/Shanghai')
    ts = local.timestamp
    print arrow.get(ts)
    #print local.format('YYYY-MM-DD HH:mm:ss ZZ')
    """function and heartbeat"""
    ex = TimeoutException("timeout ex")
    # gevent timeout
    timeout = Timeout(6, ex)
    # start
    timeout.start()
    try:
        # exception will be raised here, after *seconds*
        # passed since start() call
        gevent.sleep(3 * random.randint(1, 4))
        #print "f1 heart beat"
        heartbeat("f1")
    except TimeoutException as ex:
        print ex
    finally:
        # cancel timeout
        timeout.cancel()
def initial_test(self, address):
    try:
        timeout = Timeout(self.TIMEOUT,
                          TestTimeout('The server timed out on the first command.'))
        timeout.start()
        TestClient(address).put('key', 'value')
    finally:
        timeout.cancel()
def _methodExecute(self, method_, id_="", timeout_=0, **args):
    # added the _ to make sure we have no conflicts with the code we execute
    timeoutObj = Timeout(timeout_)
    timeoutObj.start()
    if "lock_" in args:
        lock = args["lock_"]
        if lock != None:
            while not lock.checkCanExecute(id_):
                # print "sleep for lock:%s for methodid:%s"% (lock,id_)
                gevent.sleep(0.05)
        args.pop("lock_")
    try:
        result = method_(**args)
    except Exception as e:
        timeoutObj.cancel()
        self.methodError(id_, e)
        return None
    except Timeout as t:
        if t is not timeoutObj:
            raise RuntimeError("not my timeout")
        self.methodTimeout(id_)
        return None
    timeoutObj.cancel()
    if id_ in self.locksActive:
        self.locksActive.pop(id_)
        if lock != None:
            if id_ in lock.greenletsActive:
                lock.greenletsActive.pop(id_)  # unlock the lock for this greenlet
    else:
        print "Could not find lock for id %s" % id_
    return result
def timeout_wrapper(*args, **kwargs):
    t = Timeout(seconds, TestTimeout('Timed out after %d seconds' % seconds))
    t.start()
    try:
        ret = func(*args, **kwargs)
    finally:
        t.cancel()
    return ret
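
# The wrapper above is the inner function of a timeout decorator; `seconds`
# and `func` are closed over from the enclosing scope. A minimal sketch of
# the enclosing decorator is shown below. The decorator name
# `with_test_timeout` and the `TestTimeout` exception class are illustrative
# assumptions, not names confirmed by the original code.
import functools
from gevent import Timeout

class TestTimeout(Exception):
    pass

def with_test_timeout(seconds):
    def decorator(func):
        @functools.wraps(func)
        def timeout_wrapper(*args, **kwargs):
            t = Timeout(seconds, TestTimeout('Timed out after %d seconds' % seconds))
            t.start()
            try:
                ret = func(*args, **kwargs)
            finally:
                t.cancel()
            return ret
        return timeout_wrapper
    return decorator

# usage: decorate a test with @with_test_timeout(5) to fail it after 5 seconds
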
def reposts_crawler(): ''' greenlet reposts crawler ''' while not reposts_fetch_queue.empty(): IS_NEED_REFETCH = False #when timeout or errors occur,put the url back into the task queue and the make sure the task is not set to done! try: wait_time = Timeout(MAX_WAIT_TIME) wait_time.start() url = reposts_fetch_queue.get() gevent.sleep(0.0) reposts_time = _http_call(url) for status in reposts_time['reposts']: if not status.get('deleted'): weibo_created_at = datetime.strptime(status.get('created_at'), '%a %b %d %H:%M:%S +0800 %Y') user_created_at = datetime.strptime(status.get('user').get('created_at'), '%a %b %d %H:%M:%S +0800 %Y') reposts_status_id = -1 if status.get('retweeted_status') is not None: reposts_status = status['retweeted_status'] reposts_status_id = reposts_status['id'] weibo_params = ( status['id'], status['user']['id'], status['text'], status['source'], weibo_created_at, reposts_status_id) user_params = ( status['user']['id'], status['user']['screen_name'], status['user']['name'], status['user']['province'], status['user']['city'], status['user']['location'], status['user']['description'], status['user']['profile_image_url'], status['user']['domain'], status['user']['gender'], status['user']['followers_count'], status['user']['friends_count'], status['user']['statuses_count'] , status['user']['favourites_count'], user_created_at, status['user']['verified'], status['user']['verified_type'], status['user']['verified_reason'], status['user']['bi_followers_count'] ) cursor.execute(REPOSTS_WEIBO_INSERT_SQL, weibo_params) cursor.execute(REPOSTS_USER_INSERT_SQL, user_params) except Timeout as t: if t is wait_time: # print '处理超时,等待重新抓取!' #put timeout url back into the task queue IS_NEED_REFETCH = True except Exception as e: IS_NEED_REFETCH = True logger.error(traceback.format_exc()) finally: wait_time.cancel() if IS_NEED_REFETCH is not True: reposts_fetch_queue.task_done() # print url + ' 抓取完成 --- 转发' else: reposts_fetch_queue.put(url) print status print url + ' 抓取失败 --- 转发'
def serve_for_test(self):
    timeout = Timeout(10)
    timeout.start()
    try:
        while self.is_connected():
            if len(self.re_schedule_events) == 10 and len(self.heartbeat_events) == 10:
                break
            gevent.sleep(0.01)
    finally:
        timeout.cancel()
def requestGet(self, url):
    wait = random.random() * (wait_time[1] - wait_time[0])
    sleep(wait)
    timeout = Timeout(request_timeout)
    timeout.start()
    req = None
    try:
        req = requests.get(url=url, verify=True, headers=headers, proxies=proxies)
    except IncompleteRead:
        pass  # todo: unknown error, cause not yet identified
    finally:
        timeout.cancel()
    return req
def query_documents_with_timeout(*args, **kwargs):
    timeout = Timeout(30)
    timeout.start()
    try:
        gevent.sleep(0.0001)
        return query_documents(*args, **kwargs)
    except:
        return [[], 0]
    finally:
        timeout.cancel()
def get_commits():
    print('Start - {0}'.format(datetime.datetime.now()))
    timeout = Timeout(10)
    timeout.start()
    try:
        # pass the callable and its argument separately, otherwise
        # download(url) runs synchronously before spawn() is called
        job_stack = [gevent.spawn(download, url) for url in urls]
        gevent.joinall(job_stack)
    except Timeout:
        pass
    finally:
        timeout.cancel()
    cntx = OrderedDict(sorted(result.items()))
    print('End - {0}'.format(datetime.datetime.now()))
    return render_template('start.html', cntx=cntx)
def start(self):
    task = self.task_queue.get(block=False)  # [target, (vid1, [name1, class1])]
    target = task[0]
    poc_vid = task[1][0]
    poc_name = task[1][1][0].split(".")[-1]
    poc = task[1][1][1]()
    poc.scan_info = {
        'TaskId': self.task_id,
        'Target': target,
        'Verbose': self.verbose,
        'Error': '',
        'Mode': self.mode,
        'Success': False,
        'Ret': tree(),
        "risk_category": poc.scan_info.get('risk_category', '')
    }
    poc.poc_info["poc"]["Class"] = task[1][1][1].__name__
    timeout = Timeout(self.fb.poc_setting.timeout)
    timeout.start()
    try:
        log.info("{} - {} start...".format(poc_vid, target))
        poc.run(fb=self.fb)
        log.info("{} - {} finish.".format(poc_vid, target))
    except Timeout:
        poc.scan_info['Error'] = "PoC run timeout."
        poc.scan_info['Success'] = False
        log.error("{} - {} error: PoC run timeout.".format(poc_vid, target))
    except (requests.exceptions.Timeout, requests.exceptions.ConnectionError) as e:
        poc.scan_info['Error'] = str(e)
        poc.scan_info['Success'] = False
        log.error("{} - {} error: {}.".format(poc_vid, target, e))
    except Exception:
        import traceback
        err = traceback.format_exc()
        poc.scan_info['Error'] = err
        poc.scan_info['Success'] = False
        log.error("{} - {} error: {}.".format(poc_vid, target, err))
    finally:
        timeout.cancel()
    if not poc.scan_info.get("Success", False):
        return
    if self.fb.poc_setting.return_resp:
        poc.scan_info["req_resp"] = self._get_http_data(poc_vid, target)
    self.result.put_nowait([poc_name, poc.poc_info, poc.scan_info])
def generate():
    result = None
    while result is None:
        try:
            timeout = Timeout(25)
            timeout.start()
            result = json.dumps(client.get_events(
                queue_id=queue_id, last_event_id=last_event_id))
            logging.debug('got a response')
        except Timeout:
            pass
        finally:
            timeout.cancel()
        yield result or ' '
def read(self, nbytes):
    if self.timeout is None:
        timeout = None
    else:
        timeout = Timeout(self.timeout)
        timeout.start()
    try:
        buf = fd.read(self.fd, nbytes)
    except Timeout as e:
        if e is not timeout:
            raise
        raise TIMEOUT('Timeout reading from fd')
    else:
        if timeout is not None:
            timeout.cancel()
    return buf
def handle(self, body):
    t = int((self.timestamp + self.expiration) - time.time())
    worker = self.get_worker(self.routing_key)
    log.debug("Running {0} with timeout {1} sec.".format(self.w_name, t))
    timeout = Timeout(t, TimeoutError)
    timeout.start()
    try:
        res = worker(body)
        log.debug('Task finished.')
        return res
    except Exception as e:
        log.debug(traceback.format_exc())
        log.error('Task error: {0}'.format(unicode(e)))
        return e
    finally:
        timeout.cancel()
class TimeoutMixin(object):
    """Timeout mixin."""

    def __init__(self, secs=None, exception=None, ref=True, priority=-1):
        self.secs = secs
        self.timeout = Timeout(secs)

    def stop_timeout(self):
        self.timeout.cancel()

    def reset_timeout(self):
        self.timeout.cancel()
        self.timeout = Timeout(self.secs, False)
        self.timeout.start()

    def start_timeout(self):
        self.timeout.start()
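
# A minimal usage sketch for the mixin above, assuming a gevent greenlet that
# treats prolonged inactivity as a dead peer. `IdleConnection`, `recv_one`
# and the 30-second default are illustrative assumptions, not original code.
import gevent
from gevent import Timeout

class IdleConnection(TimeoutMixin):
    def __init__(self, sock, secs=30):
        TimeoutMixin.__init__(self, secs=secs)
        self.sock = sock

    def serve(self):
        self.start_timeout()
        try:
            while True:
                self.recv_one()       # blocking read; may hang on a dead peer
                self.reset_timeout()  # any activity pushes the deadline back
        except Timeout:
            self.sock.close()         # no activity for `secs` seconds
        finally:
            self.stop_timeout()

    def recv_one(self):
        gevent.sleep(1)               # stand-in for real socket I/O
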
def _read_result():
    timeout = Timeout(self._read_timeout, Timeout)
    timeout.start()
    try:
        result = self._read_result(cmd)
        result_channel.put(result)
    except Timeout:
        raise
    except:
        self.log.exception("read error in defer_command")
        result_channel.put((MemcacheResult.ERROR, error_value))
        self.log.warn("Error communicating with Memcache %s, disconnecting", self._address)
        self.disconnect()
    finally:
        timeout.cancel()
def _write_command():
    timeout = Timeout(self._write_timeout, Timeout)
    timeout.start()
    try:
        if not self.is_connected():
            self.connect()
        self._write_command(cmd, args, True)
        self._read_queue.defer(_read_result)
    except Timeout:
        raise
    except:
        result_channel.put((MemcacheResult.ERROR, error_value))
        self.log.warn("Error communicating with Memcache %s, disconnecting", self._address)
        self.disconnect()
    finally:
        timeout.cancel()
def _send(self, message, receiveTimeout=_RECEIVE_TIMEOUT): ''' Tries to send a message to the server and waits a fixed amount of seconds for it to answer. @param message: message to send to the server @type message: str @param receiveTimeout: max time in seconds in which the server should respond @type receiveTimeout: int ''' if not self._isConnected: self._connect() try: self._socket.send(message) except Exception as e: self._disconnect() printInDebugMode('Couldn\'t send message, unexpected exception' ' while sending to server: %s' % e) result = None timeout = Timeout(receiveTimeout) timeout.start() try: result = self._socket.recv() except Timeout as timeoutException: if timeoutException == timeout: printInDebugMode( 'Couldn\'t send message, server response timed' ' out after %d seconds' % receiveTimeout) self._disconnect() except Exception as e: printInDebugMode('Couldn\'t send message, unexpected exception' ' while receiving response from server: %s' % e) self._disconnect() finally: timeout.cancel() return result == '1'
def comments_crawler(): ''' greenlet comments crawler ''' while not comments_fetch_queue.empty(): IS_NEED_REFETCH = False #when timeout or errors occur,put the url back into the task queue and the make sure the task is not set to done! try: wait_time = Timeout(MAX_WAIT_TIME) wait_time.start() url = comments_fetch_queue.get() gevent.sleep() comments_time = _http_call(url) for status in comments_time['comments']: weibo_created_at = datetime.strptime(status.get('created_at'), '%a %b %d %H:%M:%S +0800 %Y') user_created_at = datetime.strptime(status.get('user').get('created_at'), '%a %b %d %H:%M:%S +0800 %Y') comments_status_id = -1 if status['status'] is not None: global retweeted_status_id comments_status = status['status'] comments_status_id = comments_status['id'] weibo_params = (status['id'], status['user']['id'], status['text'], status['source'], weibo_created_at, comments_status_id) user_params = (status['user']['id'], status['user']['screen_name'], status['user']['name'], status['user']['province'], status['user']['city'], status['user']['location'], status['user']['description'], status['user']['profile_image_url'], status['user']['domain'], status['user']['gender'], status['user']['followers_count'], status['user']['friends_count'], status['user']['statuses_count'], status['user']['favourites_count'], user_created_at, status['user']['verified'], status['user']['verified_type'], status['user']['verified_reason'], status['user']['bi_followers_count'] ) cursor.execute(COMMENTS_WEIBO_INSERT_SQL, weibo_params) cursor.execute(COMMENTS_USER_INSERT_SQL, user_params) except Timeout as t: if t is wait_time: # print '处理超时,等待重新抓取!' #put timeout url back into the task queue IS_NEED_REFETCH = True except Exception as e: print e.message print sys.exc_info() finally: wait_time.cancel() if IS_NEED_REFETCH is not True: comments_fetch_queue.task_done() # print url + ' 抓取完成 --- 评论' else: comments_fetch_queue.put(url)
def cancel_port_forward(self, address, port):
    """\
    Cancel a port forwarding request. This cancellation request is sent to the
    server and on the server the port forwarding should be unregistered.

    @param address: remote server address
    @type address: C{str}

    @param port: remote port
    @type port: C{int}
    """
    timeout = Timeout(10)
    timeout.start()
    try:
        self.ssh_transport.global_request('cancel-tcpip-forward', (address, port), wait=True)
    except:
        pass
    finally:
        timeout.cancel()
def get_git_refs():
    if DISABLE_NEW_EXTENSIONS:
        return 'Disabled', 403

    git_endpoint = request.values.get('ep', None)
    if git_endpoint is None:
        return jsonify(error={'message': 'Missing endpoint'}), 400
    if not git_endpoint.endswith('.git'):
        return jsonify(error={'message': 'Invalid git endpoint'}), 400

    git_path = config.get('MINEMELD_GIT_PATH', None)
    if git_path is None:
        return jsonify(error={'message': 'MINEMELD_GIT_PATH not set'}), 500

    git_args = [git_path, 'ls-remote', '-t', '-h', git_endpoint]
    git_process = Popen(
        args=git_args,
        close_fds=True,
        shell=False,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE
    )

    timeout = Timeout(20.0)
    timeout.start()
    try:
        git_stdout, git_stderr = git_process.communicate()
    except Timeout:
        git_process.kill()
        return jsonify(error={'message': 'Timeout accessing git repo'}), 400
    finally:
        timeout.cancel()

    if git_process.returncode != 0:
        LOG.error('Error running {}: {}'.format(git_args, git_stderr))
        return jsonify(error={'message': 'Error running git: {}'.format(git_stderr)}), 400

    return jsonify(result=[line.rsplit('/', 1)[-1] for line in git_stdout.splitlines()])
def test():
    import filecmp
    import shutil

    # Mock urlopen
    def mocked_urlopen(url):
        usplit = urlsplit(url)
        if 'test_site' not in usplit.hostname:
            sys.exit('Crawler tried to visit external host')
        url_path = usplit.path.lstrip('/')
        real_path = os.path.join('test_site', url_path)
        if os.path.isdir(real_path):
            real_path = os.path.join(real_path, 'index.html')
        if os.path.isfile(real_path):
            return open(real_path, 'rb')
        else:
            raise IOError('Error 404: Not Found')

    global urlopen
    urlopen = mocked_urlopen

    # Clean target directory
    if os.path.exists(TMPDIR):
        shutil.rmtree(TMPDIR)
    os.mkdir(TMPDIR)

    timeout = Timeout(15)
    try:
        timeout.start()
        Crawler('http://test_site', JOBS, TMPDIR).run()
    except Timeout:
        sys.exit('Test timeout')
    finally:
        timeout.cancel()

    # Compare dirs
    # Not very accurate measure
    if len(os.listdir('test_site/cats')) + len(os.listdir('test_site/cats2')) != len(os.listdir(TMPDIR)):
        sys.exit('Test failed. Number of cats differs')
    print 'Test OK.'
def write(self, buf):
    if self.timeout is None:
        timeout = None
    else:
        timeout = Timeout(self.timeout)
        timeout.start()
    buf = compat.buffer(buf)
    byteswritten = 0
    try:
        while byteswritten != len(buf):
            nbytes = fd.write(self.fd, buf[byteswritten:])
            assert nbytes != 0
            byteswritten += nbytes
    except Timeout as e:
        if e is not timeout:
            raise
        raise TIMEOUT('Timeout writing to fd')
    else:
        if timeout is not None:
            timeout.cancel()
    return len(buf)
def delete(self, name):
    self.logger.debug("Preparing to execute delete()")
    timeout = Timeout(self.script_timeout)
    process = Popen(self.scriptfile_path + " -d -q" + " -n " + name,
                    stdout=PIPE, shell=True)
    output = "Delete operation exceeded execution timeout.\n"
    returncode = 1
    timeout.start()
    try:
        output = process.communicate()[0]
        returncode = process.returncode
    except Timeout:
        process.kill()
        self.logger.warn(
            "Process %s servicing delete() " +
            "exceeded execution timeout and was terminated.", process.pid)
        if process.returncode is not None:
            returncode = process.returncode
    finally:
        timeout.cancel()
    return [output, returncode]
def load_page(url):
    """Load page, record urls which are newly found, record urls which are done."""
    timeout = Timeout(random.uniform(20, 25))
    timeout.start()
    try:
        r, status_code = http_get(url)
        if status_code == 302:
            add_url_done(url)
            return
        elif status_code != 200:
            logger.info('[url] %s [status_code] %s' % (url, status_code))
            add_url_done(url)
            return
        data = r.text
    except Timeout, e:
        is_retry = move_doing2new(url)
        if not is_retry:
            logger.info('[url] %s [err_last_retry] %s' % (url, e))
        else:
            logger.info('[url] %s [err_retry] %s' % (url, e))
    finally:
        # cancel in finally so the early returns above do not leave the timer armed
        timeout.cancel()
    return
def method(*args, **kwds): # 从连接池获取连接 client = self.get_client_from_pool() # 连接池中无连接 if client is None: # 设置获取连接的超时时间 time_out = Timeout(self.get_connection_timeout) time_out.start() try: async_result = AsyncResult() self.no_client_queue.appendleft(async_result) client = async_result.get() # blocking except: with self.lock: if client is None: self.no_client_queue.remove(async_result) self.logger.error("Get Connection Timeout!") finally: time_out.cancel() if client is not None: for i in xrange(self.max_renew_times): try: put_back_flag = True client.last_use_time = time.time() fun = getattr(client, name, None) return fun(*args, **kwds) except socket.timeout: self.logger.error("Socket Timeout!") # 关闭连接,不关闭会导致乱序 put_back_flag = False self.close_one_client(client) break except TTransportException, e: put_back_flag = False if e.type == TTransportException.END_OF_FILE: self.logger.warning("Socket Connection Reset Error,%s", e) with self.lock: client.close() self.pool_size -= 1 client = self.get_new_client() else: self.logger.error("Socket Error,%s", e) self.close_one_client(client) break except socket.error, e: put_back_flag = False if e.errno == socket.errno.ECONNABORTED: self.logger.warning("Socket Connection aborted Error,%s", e) with self.lock: client.close() self.pool_size -= 1 client = self.get_new_client() else: self.logger.error("Socket Error, %s", e) self.close_one_client(client) break except Exception as e: put_back_flag = False self.logger.error("Thrift Error, %s", e) self.close_one_client(client) break
def get_Files(self, request): request.setResponseCode(200) request.setHeader('Content-Type', 'text/event-stream') request.setHeader('Expires', 'Fri, 01 Jan 1990 00:00:00 GMT') request.setHeader('Cache-Control', 'no-cache, no-store, max-age=0, must-revalidate') request.setHeader('Pragma', 'no-cache') request.setHeader('Access-Control-Allow-Origin', '*') retry = 1000 # retry time in miliseconds wait = 15 if request.env.has_key('HTTP_X_SUBSCRIPTION_WAIT'): try: wait = int(request.env['HTTP_X_SUBSCRIPTION_WAIT']) except Exception: pass # @todo: Figure out if the connection should be closed or not close = True eventQueue = Event() userObject = self.getUser(request) if not userObject: return lastId = None try: if request.env.has_key('HTTP_LAST_EVENT_ID'): lastId = request.env['HTTP_LAST_EVENT_ID'] elif request.params.has_key('Last-Event-ID'): lastId = request.params['Last-Event-ID'] if lastId is not None: lastId = "%.4f" % lastId except Exception: pass channel = "%s.files" % userObject.get('id') events = eventQueue.subscribe(channel, lastId) timeout = None try: """ @warninig: If the connection is closed from haproxy for some reason this connection is not recycled. That's why we close the connection after 15 seconds (the haproxy timeout is set to 20 seconds). @todo: Figure out why the closing on the other side is not detected and the connection hangs @todo: Figure out how to set a timeout that sends noop command every 15 seconds and keeps the connection alive for ever. """ if close: timeout = Timeout(wait).start() for event in events: if request.env.has_key('BASE_URI'): event = re.sub( r'"resource":\s*"(.*?)"', '"resource": "%s\\1"' % request.env['BASE_URI'], event) if request.env.has_key('URI_REPLACE'): match = re.search(r'"resource":\s*"(.*?)"', event) if match: uri = match.group(1).replace( request.env['URI_REPLACE'][0], request.env['URI_REPLACE'][1]) event = re.sub(r'"resource":\s*"(.*?)"', '"resource": "%s"' % uri, event) text = ": %s\n" % ''.center(2049, ' ') # 2kb padding for IE text += "data: {\"files\": %s}\n" % event match = re.search(r'"time":\s*([0-9.]+)', event) if match: text += "id: %s\n\n" % match.group(1) yield text if not close: continue # if we have to close then reset the timeout if timeout is not None: timeout.clear() timeout = Timeout(wait).start() yield ":noop\n" except Timeout: yield "retry: %d" % retry finally: if hasattr(events, 'close') and callable(events.close): events.close() if timeout is not None: timeout.cancel()
class Spider(object): """爬虫主类""" logger = logging.getLogger("spider") def __init__(self, concurrent_num=10, crawl_tags=[], custom_headers={}, plugin=['send_url_to_celery'], depth=10, max_url_num=3000000, internal_timeout=20, spider_timeout=6 * 3600, crawler_mode=1, same_origin=True, dynamic_parse=True): """ concurrent_num : 并行crawler和fetcher数量 crawl_tags : 爬行时收集URL所属标签列表 custom_headers : 自定义HTTP请求头 plugin : 自定义插件列表 depth : 爬行深度限制 max_url_num : 最大收集URL数量 internal_timeout : 内部调用超时时间 spider_timeout : 爬虫超时时间 crawler_mode : 爬取器模型(0:多线程模型,1:gevent模型) same_origin : 是否限制相同域下 dynamic_parse : 是否使用WebKit动态解析 """ USER_AGENTS = [ "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)", "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5", "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20", "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52", ] self.logger.setLevel(logging.DEBUG) hd = logging.StreamHandler() formatter = logging.Formatter( "%(asctime)s - %(name)s - %(levelname)s - %(message)s") hd.setFormatter(formatter) self.logger.addHandler(hd) self.stopped = event.Event() self.internal_timeout = internal_timeout self.internal_timer = Timeout(internal_timeout) self.crawler_mode = crawler_mode #爬取器模型 self.concurrent_num = concurrent_num self.fetcher_pool = pool.Pool(self.concurrent_num) if self.crawler_mode == 0: self.crawler_pool = threadpool.ThreadPool( min(50, self.concurrent_num)) else: self.crawler_pool = pool.Pool(self.concurrent_num) #self.fetcher_queue = queue.JoinableQueue(maxsize=self.concurrent_num*100) self.fetcher_queue = threadpool.Queue( maxsize=self.concurrent_num * 10000) self.crawler_queue = threadpool.Queue( maxsize=self.concurrent_num * 10000) self.fetcher_cache = UrlCache() self.crawler_cache = UrlCache() self.default_crawl_tags = [ 'a', 'base', 'iframe', 'frame', 'object', 'framset' ] self.ignore_ext = [ 'cab', 'ico', 'swf', 'rar', 'zip', 'tar', 'gz', 'js', '7z', 'bz2', 'iso', 'nrg', 'uif', 'exe', 'rpm', 
'deb', 'dmg', 'jar', 'jad', 'bin', 'apk', 'run', 'msi', 'xls', 'xlsx', 'ppt', 'pptx', 'pdf', 'doc', 'docx', 'odf', 'rtf', 'odt', 'mkv', 'avi', 'mp4', 'flv', 'WebM', 'mov', 'wmv', '3gp', 'mpg', 'mpeg', 'mp3', 'wav', 'ss3', 'ogg', 'mp4a', 'wma', 'png', 'jpeg', 'jpg', 'xpm', 'gif', 'tiff', 'css', 'bmp', 'svg', 'exif', 'thmx', 'xml', 'txt' ] self.crawl_tags = list(set(self.default_crawl_tags) | set(crawl_tags)) self.same_origin = same_origin self.depth = depth self.max_url_num = max_url_num self.dynamic_parse = dynamic_parse if self.dynamic_parse: self.webkit = WebKit() self.crawler_stopped = event.Event() self.plugin_handler = plugin #注册Crawler中使用的插件 self.custom_headers = {'User-Agent': random.choice(USER_AGENTS)} def _start_fetcher(self): ''' 启动下载器 ''' for i in xrange(self.concurrent_num): fetcher = Fetcher(self) self.fetcher_pool.start(fetcher) def _start_crawler(self): ''' 启动爬取器 ''' for _ in xrange(self.concurrent_num): self.crawler_pool.spawn(self.crawler) def start(self): ''' 主启动函数 ''' self.logger.info("spider starting...") if self.crawler_mode == 0: self.logger.info("crawler run in multi-thread mode.") elif self.crawler_mode == 1: self.logger.info("crawler run in gevent mode.") self._start_fetcher() self._start_crawler() self.stopped.wait() #等待停止事件置位 try: self.internal_timer.start() self.fetcher_pool.join(timeout=self.internal_timer) if self.crawler_mode == 1: self.crawler_pool.join(timeout=self.internal_timer) else: self.crawler_pool.join() except Timeout: self.logger.error("internal timeout triggered") finally: self.internal_timer.cancel() self.stopped.clear() if self.dynamic_parse: self.webkit.close() self.logger.info("crawler_cache:%s fetcher_cache:%s" % (len( self.crawler_cache), len(self.fetcher_cache))) self.logger.info("spider process quit.") def crawler(self, _dep=None): ''' 爬行器主函数 ''' while not self.stopped.isSet() and not self.crawler_stopped.isSet(): try: self._maintain_spider() #维护爬虫池 url_data = self.crawler_queue.get(block=False) except queue.Empty, e: if self.crawler_queue.unfinished_tasks == 0 and self.fetcher_queue.unfinished_tasks == 0: self.stop() else: if self.crawler_mode == 1: gevent.sleep() else: pre_depth = url_data.depth curr_depth = pre_depth + 1 link_generator = HtmlAnalyzer.extract_links( url_data.html, url_data.url, self.crawl_tags) link_list = [url for url in link_generator] if self.dynamic_parse: link_generator = self.webkit.extract_links(url_data.url) link_list.extend([url for url in link_generator]) link_list = list(set(link_list)) for index, link in enumerate(link_list): if not self.check_url_usable(link): continue # 增加url相似性判断,详见urlfilter.py if not self.check_url_similar(link): continue # 增加url重复判断,详见urlfilter.py if not self.check_url_repeat(link): continue if curr_depth > self.depth: #最大爬行深度判断 if self.crawler_stopped.isSet(): break else: self.crawler_stopped.set() break if len(self. fetcher_cache) == self.max_url_num: #最大收集URL数量判断 if self.crawler_stopped.isSet(): break else: self.crawler_stopped.set() break link = to_unicode(link) url = UrlData(link, depth=curr_depth) self.fetcher_cache.insert(url) self.fetcher_queue.put(url, block=True) for plugin_name in self.plugin_handler: #循环动态调用初始化时注册的插件 try: plugin_obj = eval(plugin_name)() plugin_obj.start(url_data) except Exception, e: import traceback traceback.print_exc() self.crawler_queue.task_done()
def run(self): """ While determined to be running, run loop is: - check if total time has been violated - if max time has been validated log and put in finished state - wait X seconds for next state - yield :return: """ # apply the first state so we can follow event loop flow self.event_result_q.put_nowait(self.machine.events.first_state()) logger.debug( '%s', { 'message': 'sending_first_state', 'first_state': self.machine.events.first_state() }) timeout = Timeout(self.machine.max_timeout) timeout.start() try: while self.machine.is_running(): # how do we support a general overarching timeout # and a specific one for the current running event try: # we can ignore the next state, this is only used to indicate # when it's time to apply a transition result = self.event_result_q.get() except gevent.queue.Empty: logger.debug('%s', { 'message': 'queue_empty', }) else: if result == EVENT_RESULT.FAILURE: logger.debug('%s', {'message': 'task_failure'}) return False logger.debug('%s', { 'message': 'state_change_requested', }) self.machine.events.teardown_current() self.machine.next_state() if self.machine.state == STATES.FINISHED: logger.debug( '%s', { 'message': 'task_execution_finished', 'status': 'SUCCESS', }) return True self.machine.run_current_event( event_result_q=self.event_result_q) except Timeout: logger.error( '%s', { 'message': 'task timeout reached', 'timeout': self.machine.max_timeout, 'units': 'seconds' }) return False finally: timeout.cancel() return True
#!/usr/bin/env python
# encoding: utf-8
import gevent
from gevent import Timeout

seconds = 10
timeout = Timeout(seconds)
timeout.start()

def wait():
    gevent.sleep(10)

try:
    gevent.spawn(wait).join()
except Timeout:
    print('Could not complete')
finally:
    timeout.cancel()
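
# For comparison, Timeout also works as a context manager, which starts the
# timer on entry and cancels it on exit. This is a sketch of the same guard
# as the script above, with an assumed 5-second limit rather than the
# original value.
import gevent
from gevent import Timeout

def wait():
    gevent.sleep(10)

try:
    with Timeout(5):
        gevent.spawn(wait).join()
except Timeout:
    print('Could not complete')
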
class Spider(object): """爬虫主类""" logger = logging.getLogger("spider") def __init__(self, concurrent_num=20, crawl_tags=[], depth=3, max_url_num=300, internal_timeout=60, spider_timeout=6 * 3600, crawler_mode=0, same_origin=True, dynamic_parse=False): """ concurrent_num : 并行crawler和fetcher数量 crawl_tags : 爬行时收集URL所属标签列表 depth : 爬行深度限制 max_url_num : 最大收集URL数量 internal_timeout : 内部调用超时时间 spider_timeout : 爬虫超时时间 crawler_mode : 爬取器模型(0:多线程模型,1:gevent模型) same_origin : 是否限制相同域下 dynamic_parse : 是否使用WebKit动态解析 """ self.logger.setLevel(logging.DEBUG) hd = logging.StreamHandler() formatter = logging.Formatter( "%(asctime)s - %(name)s - %(levelname)s - %(message)s") hd.setFormatter(formatter) self.logger.addHandler(hd) self.stopped = event.Event() self.internal_timer = Timeout(internal_timeout) self.crawler_mode = crawler_mode #爬取器模型 self.concurrent_num = concurrent_num self.fetcher_pool = pool.Pool(self.concurrent_num) if self.crawler_mode == 0: self.crawler_pool = threadpool.ThreadPool( min(50, self.concurrent_num)) else: self.crawler_pool = pool.Pool(self.concurrent_num) #self.fetcher_queue = queue.JoinableQueue(maxsize=self.concurrent_num*100) self.fetcher_queue = threadpool.Queue(maxsize=self.concurrent_num * 100) self.crawler_queue = threadpool.Queue(maxsize=self.concurrent_num * 100) self.fetcher_cache = UrlCache() self.crawler_cache = UrlCache() self.default_crawl_tags = ['a', 'base', 'iframe', 'frame', 'object'] self.ignore_ext = [ 'js', 'css', 'png', 'jpg', 'gif', 'bmp', 'svg', 'exif', 'jpeg', 'exe', 'rar', 'zip' ] self.crawl_tags = list(set(self.default_crawl_tags) | set(crawl_tags)) self.same_origin = same_origin self.depth = depth self.max_url_num = max_url_num self.dynamic_parse = dynamic_parse if self.dynamic_parse: self.webkit = WebKit() self.crawler_stopped = event.Event() def _start_fetcher(self): ''' 启动下载器 ''' for i in xrange(self.concurrent_num): fetcher = Fetcher(self) self.fetcher_pool.start(fetcher) def _start_crawler(self): ''' 启动爬取器 ''' for _ in xrange(self.concurrent_num): self.crawler_pool.spawn(self.crawler) def start(self): ''' 主启动函数 ''' self.logger.info("spider starting...") if self.crawler_mode == 0: self.logger.info("crawler run in multi-thread mode.") elif self.crawler_mode == 1: self.logger.info("crawler run in gevent mode.") self._start_fetcher() self._start_crawler() self.stopped.wait() #等待停止事件置位 try: self.internal_timer.start() self.fetcher_pool.join(timeout=self.internal_timer) if self.crawler_mode == 1: self.crawler_pool.join(timeout=self.internal_timer) else: self.crawler_pool.join() except Timeout: self.logger.error("internal timeout triggered") finally: self.internal_timer.cancel() self.stopped.clear() if self.dynamic_parse: self.webkit.close() self.logger.info("crawler_cache:%s fetcher_cache:%s" % (len(self.crawler_cache), len(self.fetcher_cache))) self.logger.info("spider process quit.") def crawler(self, _dep=None): ''' 爬行器主函数 ''' while not self.stopped.isSet() and not self.crawler_stopped.isSet(): try: self._maintain_spider() #维护爬虫池 url_data = self.crawler_queue.get(block=False) except queue.Empty, e: if self.crawler_queue.unfinished_tasks == 0 and self.fetcher_queue.unfinished_tasks == 0: self.stop() else: if self.crawler_mode == 1: gevent.sleep() else: pre_depth = url_data.depth curr_depth = pre_depth + 1 link_generator = HtmlAnalyzer.extract_links( url_data.html, url_data.url, self.crawl_tags) link_list = [url for url in link_generator] if self.dynamic_parse: link_generator = self.webkit.extract_links(url_data.url) link_list.extend([url for url in 
link_generator]) link_list = list(set(link_list)) for index, link in enumerate(link_list): if not self.check_url_usable(link): continue if curr_depth > self.depth: #最大爬行深度判断 if self.crawler_stopped.isSet(): break else: self.crawler_stopped.set() break if len(self.fetcher_cache ) == self.max_url_num: #最大收集URL数量判断 if self.crawler_stopped.isSet(): break else: self.crawler_stopped.set() break link = to_unicode(link) url = UrlData(link, depth=curr_depth) self.fetcher_cache.insert(url) self.fetcher_queue.put(url, block=True) self.crawler_queue.task_done()
class Spider(object): """爬虫主类""" logger = logging.getLogger("spider") def __init__(self, concurrent_num=20, crawl_tags=[], custom_headers={}, plugin=[], depth=10, max_url_num=3000, internal_timeout=60, spider_timeout=1800, dir_max_url=15, crawler_mode=0, same_origin=True, dynamic_parse=False, login_dict={}, scan_task_id=0): """ concurrent_num : 并行crawler和fetcher数量 crawl_tags : 爬行时收集URL所属标签列表 custom_headers : 自定义HTTP请求头 plugin : 自定义插件列表 depth : 爬行深度限制 max_url_num : 最大收集URL数量 internal_timeout : 内部调用超时时间 spider_timeout : 爬虫超时时间 crawler_mode : 爬取器模型(0:多线程模型,1:gevent模型) same_origin : 是否限制相同域下 dynamic_parse : 是否使用WebKit动态解析 """ self.logger.setLevel(logging.DEBUG) hd = logging.StreamHandler() formatter = logging.Formatter( "%(asctime)s - %(name)s - %(levelname)s - %(message)s") hd.setFormatter(formatter) self.logger.addHandler(hd) self.stopped = event.Event() self.internal_timeout = internal_timeout self.internal_timer = Timeout(internal_timeout) self.spider_stop_time = time() + spider_timeout self.crawler_mode = crawler_mode # 爬取器模型 self.concurrent_num = concurrent_num self.fetcher_pool = pool.Pool(self.concurrent_num) if self.crawler_mode == 0: self.crawler_pool = threadpool.ThreadPool( min(50, self.concurrent_num)) else: self.crawler_pool = pool.Pool(self.concurrent_num) # self.fetcher_queue = queue.JoinableQueue(maxsize=self.concurrent_num*100) self.fetcher_queue = threadpool.Queue(maxsize=self.concurrent_num * 10000) self.crawler_queue = threadpool.Queue(maxsize=self.concurrent_num * 10000) self.fetcher_cache = UrlCache() self.crawler_cache = UrlCache() self.default_crawl_tags = [ 'script', 'a', 'base', 'iframe', 'frame', 'object' ] self.ignore_ext = [ 'js', 'css', 'png', 'jpg', 'gif', 'bmp', 'svg', 'exif', 'jpeg', 'exe', 'rar', 'zip', 'swf', 'ico' ] self.crawl_tags = list(set(self.default_crawl_tags) | set(crawl_tags)) self.same_origin = same_origin self.depth = depth self.max_url_num = max_url_num self.dir_max_url = dir_max_url self.dynamic_parse = dynamic_parse if self.dynamic_parse: self.webkit = WebKit(login_dict) if login_dict: self.webkit.auto_login() # elif custom_headers.get('Cookie'): # # self.webkit.set_cookie(custom_headers) self.crawler_stopped = event.Event() self.plugin_handler = plugin # 注册Crawler中使用的插件 self.custom_headers = custom_headers self.scan_task_id = scan_task_id def _start_fetcher(self): ''' 启动下载器 ''' for i in xrange(self.concurrent_num): fetcher = Fetcher(self) self.fetcher_pool.start(fetcher) def _start_crawler(self): ''' 启动爬取器 ''' for _ in xrange(self.concurrent_num): crawler = Crawler(self) self.crawler_pool.spawn(crawler._run()) def start(self): ''' 主启动函数 ''' self.logger.info("spider starting...") if self.crawler_mode == 0: self.logger.info("crawler run in multi-thread mode.") elif self.crawler_mode == 1: self.logger.info("crawler run in gevent mode.") self._start_fetcher() # sleep(60) self._start_crawler() print 2222222222222 self.stopped.wait() # 等待停止事件置位 try: self.internal_timer.start() self.fetcher_pool.join(timeout=self.internal_timer) if self.crawler_mode == 1: self.crawler_pool.join(timeout=self.internal_timer) else: self.crawler_pool.join() except Timeout: self.logger.error("internal timeout triggered") finally: self.internal_timer.cancel() self.stopped.clear() if self.dynamic_parse: self.webkit.close() self.logger.info("crawler_cache:%s fetcher_cache:%s" % (len(self.crawler_cache), len(self.fetcher_cache))) self.logger.info("spider process quit.") def feed_url(self, url): ''' 设置初始爬取URL ''' if isinstance(url, basestring): url = to_unicode(url) url = 
UrlData(url) if self.same_origin: url_part = urlparse.urlparse(unicode(url)) self.origin = (url_part.scheme, url_part.netloc) self.fetcher_queue.put(url, block=True) def stop(self): ''' 终止爬虫 ''' self.stopped.set()
def send(self, data, max_bytes=None, retry_wait=0.25): """ Socket Wrapper for people using this class as if it were just a python socket class; always ignore EINTR flags to finish sending the data """ # track bytes written tot_bytes = 0 # Get reference time cur_time = datetime.now() # Current elapsed time elapsed_time = 0.0 if not max_bytes: max_bytes = len(data) while self.connected and (max_bytes - tot_bytes) > 0: # This timer is used to determine how long we should # wait before assuming we can't send content and that we # have connection problems. stale_timeout = max(((max_bytes - tot_bytes) / 10800.0), 15.0) if not self.can_write(stale_timeout): # can't write down pipe; something has gone wrong self.close() raise SocketException('Connection write wait timeout') # bump 10 seconds onto our timeout stale_timeout += 10 # Initialize our stale timeout timer stale_timer = Timeout(stale_timeout) # Update Elapsed Time elapsed_time = datetime.now() - cur_time elapsed_time = (elapsed_time.days * 86400) \ + elapsed_time.seconds + (elapsed_time.microseconds / 1e6) try: # Start stale_timer stale_timer.start() # Send data bytes_sent = self.socket.send( data[tot_bytes:tot_bytes + max_bytes], ) stale_timer.cancel() # Get our elapsed transfer time elapsed_xfer_time = datetime.now() - cur_time elapsed_xfer_time = ( (elapsed_xfer_time.days * 86400) + elapsed_xfer_time.seconds + (elapsed_xfer_time.microseconds / 1e6)) - elapsed_time if not bytes_sent: self.close() raise SocketException('Connection lost') # Handle content received tot_bytes += bytes_sent # Call write hook self.hooks.call( 'socket_write', # Host information host=self._remote_addr, port=self._remote_port, # The number of bytes read xfer_bytes=bytes_sent, # The time it took to read these bytes xfer_time=elapsed_xfer_time, # Our sockets socket=weakref.proxy(self), ) except Timeout: # Timeout occurred; Sleep for a little bit sleep(retry_wait) if self.can_write() is None: self.close() raise SocketException('Connection broken due to timeout') except socket.error, e: if e[0] == errno.EAGAIN: # Timeout occurred; Sleep for a little bit sleep(retry_wait) if self.can_write() is None: self.close() raise SocketException('Connection broken') elif e[0] == errno.EINTR: # A Signal was caught, resend this data before # raising the signal higher. signals can wait when # there is data flowing raise SignalCaughtException('Signal received') else: # errno.EPIPE (Broken Pipe) usually at this point self.close() raise SocketException('Connection lost') except Exception, e: self.close() raise
class Spider(object): """爬虫主类""" logger = logging.getLogger("spider") # 相当于构造函数 def __init__(self, concurrent_num=20, crawl_tags=[], custom_headers={}, plugin=[], depth=3, max_url_num=300, internal_timeout=60, spider_timeout=6 * 3600, crawler_mode=0, same_origin=True, dynamic_parse=False): """ concurrent_num : 并行crawler和fetcher数量 crawl_tags : 爬行时收集URL所属标签列表 custom_headers : 自定义HTTP请求头 plugin : 自定义插件列表 depth : 爬行深度限制 max_url_num : 最大收集URL数量 internal_timeout : 内部调用超时时间 spider_timeout : 爬虫超时时间 crawler_mode : 爬取器模型(0:多线程模型,1:gevent模型) same_origin : 是否限制相同域下 dynamic_parse : 是否使用WebKit动态解析 """ # 日志模块 self.logger.setLevel(logging.DEBUG) # 日志级别 formatter = logging.Formatter( "%(asctime)s - %(name)s - %(levelname)s - %(message)s") # 日志格式 hd = logging.StreamHandler() hd.setFormatter(formatter) self.logger.addHandler(hd) self.stopped = event.Event() self.internal_timeout = internal_timeout # 内部调用超时时间 self.internal_timer = Timeout(internal_timeout) self.crawler_mode = crawler_mode # 爬取器模型 self.concurrent_num = concurrent_num # 并行crawler与fetcher数量 # fetcher使用gevent模型 self.fetcher_pool = pool.Pool(self.concurrent_num) # crawler模型设置 # crawler负责解析并爬取HTML中的URL,送入fetcher,fetcher负责获取HTML,送入crawler if self.crawler_mode == 0: # 线程池模型 self.crawler_pool = threadpool.ThreadPool( min(50, self.concurrent_num)) else: # gevent模型 self.crawler_pool = pool.Pool(self.concurrent_num) # fetcher和crawler两部分独立工作,互不干扰,通过queue进行链接 # self.fetcher_queue = queue.JoinableQueue(maxsize=self.concurrent_num*100) self.fetcher_queue = threadpool.Queue(maxsize=self.concurrent_num * 10000) self.crawler_queue = threadpool.Queue(maxsize=self.concurrent_num * 10000) self.fetcher_cache = UrlCache() self.crawler_cache = UrlCache() self.default_crawl_tags = ['a', 'base', 'iframe', 'frame', 'object'] # 默认的爬行时收集URL所属标签列表 self.ignore_ext = [ 'js', 'css', 'png', 'jpg', 'gif', 'bmp', 'svg', 'exif', 'jpeg', 'exe', 'rar', 'zip' ] # 爬行时忽略的URL种类 self.crawl_tags = list(set(self.default_crawl_tags) | set(crawl_tags)) # 爬行时收集URL所属标签列表 self.same_origin = same_origin # 是否同源 self.depth = depth # 爬行深度限制 self.max_url_num = max_url_num # 最大收集URL数量 self.dynamic_parse = dynamic_parse # 是否使用WebKit动态解析 # 如果开启动态解析 if self.dynamic_parse: self.webkit = WebKit() self.crawler_stopped = event.Event() self.plugin_handler = plugin # 注册Crawler中使用的插件 # 自定义HTTP头 self.custom_headers = custom_headers def _start_fetcher(self): ''' 启动下载器 ''' for i in xrange(self.concurrent_num): # concurrent_num:并行数量 fetcher = Fetcher(self) # 实例化一个 fetcher self.fetcher_pool.start(fetcher) # 调用start()开始执行_run() def _start_crawler(self): ''' 启动爬取器 ''' for _ in xrange(self.concurrent_num): self.crawler_pool.spawn(self.crawler) # 启动self.crawler()函数 def start(self): ''' 主启动函数 ''' self.logger.info("spider starting...") if self.crawler_mode == 0: self.logger.info("crawler run in multi-thread mode.") elif self.crawler_mode == 1: self.logger.info("crawler run in gevent mode.") # 开启fetcher与crawler self._start_fetcher() # 初始爬取URL已在main函数中spider.feed_url(url)中设置 self._start_crawler() self.stopped.wait() # 等待停止事件置位 # 等待fetcher与crawler执行结束 try: self.internal_timer.start() self.fetcher_pool.join(timeout=self.internal_timer) if self.crawler_mode == 1: self.crawler_pool.join(timeout=self.internal_timer) else: self.crawler_pool.join() except Timeout: self.logger.error("internal timeout triggered") finally: self.internal_timer.cancel() self.stopped.clear() if self.dynamic_parse: self.webkit.close() self.logger.info("crawler_cache:%s fetcher_cache:%s" % (len(self.crawler_cache), len(self.fetcher_cache))) 
self.logger.info("spider process quit.") def crawler(self, _dep=None): ''' 爬行器主函数 ''' while not self.stopped.isSet() and not self.crawler_stopped.isSet(): try: self._maintain_spider() # 维护爬虫池 url_data = self.crawler_queue.get( block=False) # 从爬取队列取出一个URL except queue.Empty, e: # 队列为空 if self.crawler_queue.unfinished_tasks == 0 and self.fetcher_queue.unfinished_tasks == 0: # 全部处理完毕 self.stop() else: # fetcher没有处理完毕 if self.crawler_mode == 1: gevent.sleep() else: pre_depth = url_data.depth curr_depth = pre_depth + 1 # 当前深度 # 生成URL list link_generator = HtmlAnalyzer.extract_links( url_data.html, url_data.url, self.crawl_tags) link_list = [url for url in link_generator] if self.dynamic_parse: # WebKit动态解析 link_generator = self.webkit.extract_links(url_data.url) link_list.extend([url for url in link_generator]) link_list = list(set(link_list)) # 去重 # 遍历解析出的URL list for index, link in enumerate(link_list): if not self.check_url_usable(link): continue if curr_depth > self.depth: # 最大爬行深度判断 if self.crawler_stopped.isSet(): break else: self.crawler_stopped.set() break if len(self.fetcher_cache ) == self.max_url_num: # 最大收集URL数量判断 if self.crawler_stopped.isSet(): break else: self.crawler_stopped.set() break link = to_unicode(link) url = UrlData(link, depth=curr_depth) # 此处调整顺序,应该先加入下载队列 self.fetcher_queue.put(url, block=True) self.fetcher_cache.insert(url) # 加入到已经处理fetcher队列 # 插件部分,暂时不关注 for plugin_name in self.plugin_handler: # 循环动态调用初始化时注册的插件 try: plugin_obj = eval(plugin_name)() plugin_obj.start(url_data) except Exception, e: import traceback traceback.print_exc() self.crawler_queue.task_done()