Example #1
    def send_api_request(self, init_params):
        """Send request to api.
        
        :param dict init_params: Request params
        :return: http request response
        :rtype: str
        :raises ApiError: raise :class:`ApiError` if there are some error during connection.
        """
        # gevent based api request
        if self._gevent_async:
            timeout = Timeout(self._timeout)
            timeout.start()
            try:
                #self.logger.debug('START')
                res = self._send_api_request(init_params)
                #self.logger.debug('STOP')
            except Timeout:
                err = 'Cloudstack api call timeout after : %ss' % self._timeout
                self.logger.error(err)
                raise ApiError(err)
            except ApiError:
                raise
            finally:
                timeout.cancel()
        # blocking api request
        else:
            res = self._send_api_request(init_params, timeout=self._timeout)

        return res
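The start()/cancel() pair above is the long-hand form; gevent's Timeout also works as a context manager, which cancels the timer automatically when the block exits. A minimal sketch of the gevent branch rewritten that way, reusing self._send_api_request and self._timeout from the example above purely as an illustration:

    # raises gevent.Timeout if the call does not return within self._timeout;
    # the timer is cancelled automatically when the with-block exits
    with Timeout(self._timeout):
        res = self._send_api_request(init_params)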
Example #2
    def _get_message_data(self):
        max_size = self.extensions.getparam('SIZE', filter=int)
        reader = DataReader(self.io, max_size)

        err = None
        timeout = Timeout(self.data_timeout)
        timeout.start()
        try:
            data = reader.recv()
        except ConnectionLost:
            raise
        except SmtpError as e:
            data = None
            err = e
        finally:
            timeout.cancel()

        reply = Reply('250', '2.6.0 Message Accepted for Delivery')
        self._call_custom_handler('HAVE_DATA', reply, data, err)

        self.io.send_reply(reply)
        self.io.flush_send()

        self.have_mailfrom = None
        self.have_rcptto = None
Example #3
File: spider.py Project: Timewire/timewire
    def __init__(self, url):
        self.url = url
        self.protocol, self.domain = self.url.split("://")  #e.g. news.bbc.co.uk
        self.domain = self.domain.split('/')[0]
        self.site_data = sites[self.domain]

        self.total_words = {}

        timeout = Timeout(30, TimeoutError)
        timeout.start()
        try:
            self.html = self.read_url()
        except TimeoutError:
            print url + " timed out"
            return 
        finally:
            timeout.cancel()

        self.text = self.boiler_extract()
        self.soup = BeautifulSoup(self.html, 'lxml')
        self.article = self.is_article()

        if self.article:
            self.calc_total_words()
            articles.put(self)

        self.find_links()
Example #4
    def create(self, usernames, passwords, target_iqn, target_lun, size,
               initiator_iqn_list):
        # NB: initiator_iqn_list needs to be a comma separated list of initiator iqn strings
        self.logger.debug("Preparing to execute create()")
        timeout = Timeout(self.script_timeout)
        process = Popen(self.scriptfile_path + " -c -q" + " -u " + usernames +
                        " -p " + passwords + " -s " + size + " -m " +
                        target_lun + " -t " + target_iqn + " -i " +
                        initiator_iqn_list,
                        stdout=PIPE,
                        shell=True)

        output = "Create operation exceeded execution timeout.\n"
        returncode = 1
        timeout.start()
        try:
            output = process.communicate()[0]
            returncode = process.returncode
        except Timeout:
            process.kill()
            self.logger.warn(
                "Process %s servicing create() " +
                "exceeded execution timeout and was terminated.", process.pid)
            if process.returncode is not None:
                returncode = process.returncode
        finally:
            timeout.cancel()
        return [output, returncode]
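Subprocess-based examples like this one only interrupt cleanly if Popen.communicate() yields to the gevent hub; otherwise the surrounding Timeout never gets a chance to fire. A minimal sketch of two standard ways to make the subprocess call cooperative (nothing here is taken from the example's project):

    # option 1: monkey-patch the stdlib so subprocess, socket, etc. become gevent-aware
    from gevent import monkey
    monkey.patch_all()

    # option 2: use gevent's cooperative drop-in replacement directly
    from gevent.subprocess import Popen, PIPE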
Example #5
def main(socket, address):
    global client_mgr
    print "one client", address
    logger.debug("one client %s" % str(address))
    client = Client(socket)
    hbTimer = None
    while True:
        try:
            hbTimer = Timeout(ONE_MOVE_MAX_TIME)
            hbTimer.start()
            client.read_and_deal_cmd()
            hbTimer.cancel()
        except Timeout, t:
            if t == hbTimer:
                print "client lose"
                client.lose_hb()
                client.cancel_timeout()
                if client.latitude != None:
                    client_mgr.remove_client(client)
                client = None
                break
            else:
                print "other timeout"
                hbTimer.cancel()
                client.deal_timeout()

        except:
Example #6
    def extractor(filename):
        interpreter = POPEN(binary + [filename] + append,
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)

        timeout = Timeout(getattr(settings, 'EXTRACTION_TIMEOUT', 120),
                          ChildTimeout)
        timeout.start()
        try:
            output, run_error = interpreter.communicate('')
            timeout.cancel()
        except ChildTimeout:
            print 'killing %s' % filename
            interpreter.kill()
            raise

        if (output_type == 'text' and not output.strip()) or (
                output_type == 'html' and html_is_empty(output)) or (
                    error and (error in output or error in run_error)):
            raise ExtractionFailed()
        elif output_type == 'html':
            # strip non-breaking spaces
            return _nbsp.sub(' ', output)
        else:
            return output
Example #7
 def _recv_command(self):
     timeout = Timeout(self.command_timeout)
     timeout.start()
     try:
         return self.io.recv_command()
     finally:
         timeout.cancel()
Example #8
File: spider.py Project: Timewire/timewire
    def __init__(self, url):
        self.url = url
        self.protocol, self.domain = self.url.split(
            "://")  #e.g. news.bbc.co.uk
        self.domain = self.domain.split('/')[0]
        self.site_data = sites[self.domain]

        self.total_words = {}

        timeout = Timeout(30, TimeoutError)
        timeout.start()
        try:
            self.html = self.read_url()
        except TimeoutError:
            print url + " timed out"
            return
        finally:
            timeout.cancel()

        self.text = self.boiler_extract()
        self.soup = BeautifulSoup(self.html, 'lxml')
        self.article = self.is_article()

        if self.article:
            self.calc_total_words()
            articles.put(self)

        self.find_links()
Example #9
def func1():
    
        
    utc = arrow.utcnow()
    local = utc.to('Asia/Shanghai')
    ts = local.timestamp
    print arrow.get(ts)
    #print local.format('YYYY-MM-DD HH:mm:ss ZZ')
    
    """function and heartbeat"""
    
    ex = TimeoutException("timeout ex")
    
    #gevent timeout
    timeout = Timeout(6, ex)
    #start
    timeout.start()
    try:
        
        # exception will be raised here, after *seconds* 
        # passed since start() call
        
        gevent.sleep(3 * random.randint(1,4))
        #print "f1 heart beat"
        heartbeat("f1")

    except TimeoutException as ex:
        print ex
    finally:
        #cancel timeout
        timeout.cancel()
Example #10
 def initial_test(self, address):
     try:
         timeout = Timeout(self.TIMEOUT, TestTimeout('The server timed out on the first command.'))
         timeout.start()
         TestClient(address).put('key', 'value')
     finally:
         timeout.cancel()
Example #11
    def _methodExecute(self, method_, id_="", timeout_=0, **args):  # added the _ to make sure we have no conflicts with the code we execute
        timeoutObj = Timeout(timeout_)
        timeoutObj.start()

        if "lock_" in args:
            lock = args["lock_"]
            if lock != None:
                while not lock.checkCanExecute(id_):
                    # print "sleep for lock:%s for methodid:%s"% (lock,id_)
                    gevent.sleep(0.05)
            args.pop("lock_")

        try:
            result = method_(**args)
        except Exception as e:
            timeoutObj.cancel()
            self.methodError(id_, e)
            return None
        except Timeout as t:
            if t is not timeoutObj:
                raise RuntimeError("not my timeout")
            self.methodTimeout(id_)
            return None
        timeoutObj.cancel()
        if id_ in self.locksActive:
            self.locksActive.pop(id_)
            if lock != None:
                if id_ in lock.greenletsActive:
                    lock.greenletsActive.pop(id_)  # unlock the lock for this greenlet
            else:
                print "Could not find lock for id %s" % id_

        return result
Example #12
 def timeout_wrapper(*args, **kwargs):
     t = Timeout(seconds,
                 TestTimeout('Timed out after %d seconds' % seconds))
     t.start()
     try:
         ret = func(*args, **kwargs)
     finally:
         t.cancel()
     return ret
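gevent also ships a helper that packages this same decorator pattern; a short sketch assuming the func, seconds, args and kwargs names used by the wrapper above:

    import gevent

    # runs func(*args, **kwargs) and raises Timeout if it takes longer than
    # `seconds`; pass timeout_value=... to return a default instead of raising
    ret = gevent.with_timeout(seconds, func, *args, **kwargs)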
Example #13
def reposts_crawler():
    '''
    greenlet reposts crawler
    '''
    while not reposts_fetch_queue.empty():
        IS_NEED_REFETCH = False  # when a timeout or error occurs, put the url back into the task queue and make sure the task is not marked done!
        try:
            wait_time = Timeout(MAX_WAIT_TIME)
            wait_time.start()
            url = reposts_fetch_queue.get()
            gevent.sleep(0.0)
            reposts_time = _http_call(url)
            for status in reposts_time['reposts']:
                if not status.get('deleted'):
                    weibo_created_at = datetime.strptime(status.get('created_at'), '%a %b %d %H:%M:%S +0800 %Y')
                    user_created_at = datetime.strptime(status.get('user').get('created_at'),
                                                        '%a %b %d %H:%M:%S +0800 %Y')
                    reposts_status_id = -1

                    if status.get('retweeted_status') is not None:
                        reposts_status = status['retweeted_status']
                        reposts_status_id = reposts_status['id']

                    weibo_params = (
                        status['id'], status['user']['id'], status['text'], status['source'], weibo_created_at,
                        reposts_status_id)
                    user_params = (
                        status['user']['id'], status['user']['screen_name'], status['user']['name'],
                        status['user']['province'],
                        status['user']['city'], status['user']['location'], status['user']['description'],
                        status['user']['profile_image_url'], status['user']['domain'], status['user']['gender'],
                        status['user']['followers_count'], status['user']['friends_count'],
                        status['user']['statuses_count']
                        ,
                        status['user']['favourites_count'], user_created_at, status['user']['verified'],
                        status['user']['verified_type'], status['user']['verified_reason'],
                        status['user']['bi_followers_count'] )
                    cursor.execute(REPOSTS_WEIBO_INSERT_SQL, weibo_params)
                    cursor.execute(REPOSTS_USER_INSERT_SQL, user_params)
        except Timeout as t:
            if t is wait_time:
                # print 'Processing timed out, waiting to refetch!'
                # put the timed-out url back into the task queue
                IS_NEED_REFETCH = True
        except Exception as e:
            IS_NEED_REFETCH = True
            logger.error(traceback.format_exc())
        finally:
            wait_time.cancel()
            if IS_NEED_REFETCH is not True:
                reposts_fetch_queue.task_done()
            # print url + ' fetch finished --- reposts'
            else:
                reposts_fetch_queue.put(url)
                print status
                print url + ' fetch failed --- reposts'
Example #14
 def serve_for_test(self):
     timeout = Timeout(10)
     timeout.start()
     try:
         while self.is_connected():
             if len(self.re_schedule_events) == 10 and len(self.heartbeat_events) == 10:
                 break
             gevent.sleep(0.01)
     finally:
         timeout.cancel()
Example #15
 def timeout_wrapper(*args, **kwargs):
     t = Timeout(seconds,
         TestTimeout('Timed out after %d seconds' % seconds)
     )
     t.start()
     try:
         ret = func(*args, **kwargs)
     finally:
         t.cancel()
     return ret
Example #16
 def serve_for_test(self):
     timeout = Timeout(10)
     timeout.start()
     try:
         while self.is_connected():
             if len(self.re_schedule_events) == 10 and len(
                     self.heartbeat_events) == 10:
                 break
             gevent.sleep(0.01)
     finally:
         timeout.cancel()
Example #17
File: main.py Project: cyhhao/CrawlerImage
 def requestGet(self, url):
     wait = random.random() * (wait_time[1] - wait_time[0])
     sleep(wait)
     req = None  # so a failed request returns None instead of raising NameError
     timeout = Timeout(request_timeout)
     timeout.start()
     try:
         req = requests.get(url=url, verify=True, headers=headers, proxies=proxies)
     except IncompleteRead:
         # todo: unknown error, not yet diagnosed
         pass
     finally:
         timeout.cancel()
     return req
Example #18
def query_documents_with_timeout(*args, **kwargs):
    timeout = Timeout(30)
    timeout.start()

    try:
        gevent.sleep(0.0001)
        return query_documents(*args, **kwargs)

    except:
        return [[], 0]

    finally:
        timeout.cancel()
Example #19
def query_documents_with_timeout(*args, **kwargs):
    timeout = Timeout(30)
    timeout.start()

    try:
        gevent.sleep(0.0001)
        return query_documents(*args, **kwargs)

    except:
        return [[], 0]

    finally:
        timeout.cancel()
Example #20
def get_commits():
    print('Start - {0}'.format(datetime.datetime.now()))
    timeout = Timeout(10)
    timeout.start()
    try:
        job_stack = [gevent.spawn(download, url) for url in urls]  # pass the callable and its argument separately
        gevent.joinall(job_stack)
    except Timeout:
        pass
    finally:
        timeout.cancel()
    cntx = OrderedDict(sorted(result.items()))
    print('End - {0}'.format(datetime.datetime.now()))
    return render_template('start.html', cntx=cntx)
Example #21
File: RunPoc.py Project: zodiacyann/osprey
    def start(self):
        task = self.task_queue.get(block=False)
        # [target, (vid1, [name1, class1])]
        target = task[0]
        poc_vid = task[1][0]
        poc_name = task[1][1][0].split(".")[-1]
        poc = task[1][1][1]()

        poc.scan_info = {
            'TaskId': self.task_id,
            'Target': target,
            'Verbose': self.verbose,
            'Error': '',
            'Mode': self.mode,
            'Success': False,
            'Ret': tree(),
            "risk_category": poc.scan_info.get('risk_category', '')
        }
        poc.poc_info["poc"]["Class"] = task[1][1][1].__name__

        timeout = Timeout(self.fb.poc_setting.timeout)
        timeout.start()
        try:
            log.info("{} - {} start...".format(poc_vid, target))
            poc.run(fb=self.fb)
            log.info("{} - {} finish.".format(poc_vid, target))
        except Timeout:
            poc.scan_info['Error'] = "PoC run timeout."
            poc.scan_info['Success'] = False
            log.error("{} - {} error: PoC run timeout.".format(
                poc_vid, target))
        except (requests.exceptions.Timeout,
                requests.exceptions.ConnectionError) as e:
            poc.scan_info['Error'] = str(e)
            poc.scan_info['Success'] = False
            log.error("{} - {} error: {}.".format(poc_vid, target, e))
        except Exception:
            import traceback
            err = traceback.format_exc()
            poc.scan_info['Error'] = err
            poc.scan_info['Success'] = False
            log.error("{} - {} error: {}.".format(poc_vid, target, err))
        finally:
            timeout.cancel()
        if not poc.scan_info.get("Success", False):
            return
        if self.fb.poc_setting.return_resp:
            poc.scan_info["req_resp"] = self._get_http_data(poc_vid, target)
        self.result.put_nowait([poc_name, poc.poc_info, poc.scan_info])
Example #22
 def requestGet(self, url):
     wait = random.random() * (wait_time[1] - wait_time[0])
     sleep(wait)
     req = None  # so a failed request returns None instead of raising NameError
     timeout = Timeout(request_timeout)
     timeout.start()
     try:
         req = requests.get(url=url,
                            verify=True,
                            headers=headers,
                            proxies=proxies)
     except IncompleteRead:
         # todo: unknown error, not yet diagnosed
         pass
     finally:
         timeout.cancel()
     return req
Example #23
	def generate():
		result = None
		while result is None:
			try:
				timeout = Timeout(25)
				timeout.start()
				result = json.dumps(client.get_events(
					queue_id=queue_id,
					last_event_id=last_event_id))
				logging.debug('got a response')
			except Timeout:
				pass
			finally:
				timeout.cancel()
			yield result or ' '
Example #24
 def read(self, nbytes):
     if self.timeout is None:
         timeout = None
     else:
         timeout = Timeout(self.timeout)
         timeout.start()
     try:
         buf = fd.read(self.fd, nbytes)
     except Timeout as e:
         if e is not timeout:
             raise
         raise TIMEOUT('Timeout reading from fd')
     else:
         if timeout is not None:
             timeout.cancel()
     return buf
Example #25
File: listener.py Project: drozdovsky/crew
 def handle(self, body):
     t = int((self.timestamp + self.expiration) - time.time())
     worker = self.get_worker(self.routing_key)
     log.debug("Running {0} with timeout {1} sec.".format(self.w_name, t))
     timeout = Timeout(t, TimeoutError)
     timeout.start()
     try:
         res = worker(body)
         log.debug('Task finished.')
         return res
     except Exception as e:
         log.debug(traceback.format_exc())
         log.error('Task error: {0}'.format(unicode(e)))
         return e
     finally:
         timeout.cancel()
Example #26
File: __init__.py Project: hycxa/dggss
class TimeoutMixin(object):
    """超时
    """
    def __init__(self, secs=None, exception=None, ref=True, priority=-1):
        self.secs = secs
        self.timeout = Timeout(secs)
        
    def stop_timeout(self):
        self.timeout.cancel()
        
    def reset_timeout(self):
        self.timeout.cancel()
        self.timeout = Timeout(self.secs, False)
        self.timeout.start()
        
    def start_timeout(self):
        self.timeout.start()
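A note on the False second argument in reset_timeout(): in gevent that flag is mainly meant for the context-manager form, where an expired timeout is suppressed instead of propagating out of the block. A minimal sketch (secs and sock are placeholders, not part of the mixin):

    # the recv is abandoned after `secs` seconds, but no exception escapes
    # the with-block because exception=False tells __exit__ to swallow it
    with Timeout(secs, False):
        data = sock.recv(1024)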
Example #27
        def _read_result():

            timeout = Timeout(self._read_timeout, Timeout)
            timeout.start()
            try:
                result = self._read_result(cmd)
                result_channel.put(result)
            except Timeout:
                raise
            except:
                self.log.exception("read error in defer_command")
                result_channel.put((MemcacheResult.ERROR, error_value))

                self.log.warn("Error communicating with Memcache %s, disconnecting", self._address)
                self.disconnect()
            finally:
                timeout.cancel()
Example #28
        def _write_command():

            timeout = Timeout(self._write_timeout, Timeout)
            timeout.start()
            try:
                if not self.is_connected():
                    self.connect()
                self._write_command(cmd, args, True)
                self._read_queue.defer(_read_result)
            except Timeout:
                raise
            except:
                result_channel.put((MemcacheResult.ERROR, error_value))

                self.log.warn("Error communicating with Memcache %s, disconnecting", self._address)
                self.disconnect()
            finally:
                timeout.cancel()
Example #29
    def _send(self, message, receiveTimeout=_RECEIVE_TIMEOUT):
        '''
        Tries to send a message to the server and waits a fixed amount of
        seconds for it to answer.

        @param message: message to send to the server
        @type message: str

        @param receiveTimeout: max time in seconds in which the server should respond
        @type receiveTimeout: int
        '''

        if not self._isConnected:
            self._connect()

        try:
            self._socket.send(message)
        except Exception as e:
            self._disconnect()
            printInDebugMode('Couldn\'t send message, unexpected exception'
                             ' while sending to server: %s' % e)

        result = None

        timeout = Timeout(receiveTimeout)
        timeout.start()

        try:
            result = self._socket.recv()
        except Timeout as timeoutException:
            if timeoutException == timeout:
                printInDebugMode(
                    'Couldn\'t send message, server response timed'
                    ' out after %d seconds' % receiveTimeout)

            self._disconnect()
        except Exception as e:
            printInDebugMode('Couldn\'t send message, unexpected exception'
                             ' while receiving response from server: %s' % e)
            self._disconnect()
        finally:
            timeout.cancel()

        return result == '1'
Example #30
def comments_crawler():
    '''
    greenlet comments crawler
    '''
    while not comments_fetch_queue.empty():
        IS_NEED_REFETCH = False  # when a timeout or error occurs, put the url back into the task queue and make sure the task is not marked done!
        try:
            wait_time = Timeout(MAX_WAIT_TIME)
            wait_time.start()
            url = comments_fetch_queue.get()
            gevent.sleep()
            comments_time = _http_call(url)
            for status in comments_time['comments']:
                weibo_created_at = datetime.strptime(status.get('created_at'), '%a %b %d %H:%M:%S +0800 %Y')
                user_created_at = datetime.strptime(status.get('user').get('created_at'), '%a %b %d %H:%M:%S +0800 %Y')
                comments_status_id = -1

                if status['status'] is not None:
                    global retweeted_status_id
                    comments_status = status['status']
                    comments_status_id = comments_status['id']

                weibo_params = (status['id'], status['user']['id'], status['text'], status['source'], weibo_created_at, comments_status_id)
                user_params = (status['user']['id'], status['user']['screen_name'], status['user']['name'], status['user']['province'], status['user']['city'], status['user']['location'], status['user']['description'], status['user']['profile_image_url'], status['user']['domain'], status['user']['gender'],
                               status['user']['followers_count'], status['user']['friends_count'], status['user']['statuses_count'], status['user']['favourites_count'], user_created_at, status['user']['verified'], status['user']['verified_type'], status['user']['verified_reason'], status['user']['bi_followers_count'] )
                cursor.execute(COMMENTS_WEIBO_INSERT_SQL, weibo_params)
                cursor.execute(COMMENTS_USER_INSERT_SQL, user_params)

        except Timeout as t:
            if t is wait_time:
                # print 'Processing timed out, waiting to refetch!'
                # put the timed-out url back into the task queue
                IS_NEED_REFETCH = True
        except Exception as e:
            print e.message
            print sys.exc_info()
        finally:
            wait_time.cancel()
            if IS_NEED_REFETCH is not True:
                comments_fetch_queue.task_done()
                # print url + ' fetch finished --- comments'
            else:
                comments_fetch_queue.put(url)
Example #31
    def cancel_port_forward(self, address, port):
        """\
        Cancel a port forwarding request. This cancellation request is sent to the server and
        on the server the port forwarding should be unregistered.

        @param address: remote server address
        @type address: C{str}
        @param port: remote port
        @type port: C{int}

        """
        timeout = Timeout(10)
        timeout.start()
        try:
            self.ssh_transport.global_request('cancel-tcpip-forward', (address, port), wait=True)
        except:
            pass
        finally:
            timeout.cancel()
Example #32
    def _send(self, message, receiveTimeout=_RECEIVE_TIMEOUT):
        '''
        Tries to send a message to the server and waits a fixed amount of
        seconds for it to answer.

        @param message: message to send to the server
        @type message: str

        @param receiveTimeout: max time in seconds in which the server should respond
        @type receiveTimeout: int
        '''

        if not self._isConnected:
            self._connect()

        try:
            self._socket.send(message)
        except Exception as e:
            self._disconnect()
            printInDebugMode('Couldn\'t send message, unexpected exception'
                             ' while sending to server: %s' % e)

        result = None

        timeout = Timeout(receiveTimeout)
        timeout.start()

        try:
            result = self._socket.recv()
        except Timeout as timeoutException:
            if timeoutException == timeout:
                printInDebugMode('Couldn\'t send message, server response timed'
                                 ' out after %d seconds' % receiveTimeout)

            self._disconnect()
        except Exception as e:
            printInDebugMode('Couldn\'t send message, unexpected exception'
                             ' while receiving response from server: %s' % e)
            self._disconnect()
        finally:
            timeout.cancel()

        return result == '1'
Example #33
def get_git_refs():
    if DISABLE_NEW_EXTENSIONS:
        return 'Disabled', 403

    git_endpoint = request.values.get('ep', None)
    if git_endpoint is None:
        return jsonify(error={'message': 'Missing endpoint'}), 400

    if not git_endpoint.endswith('.git'):
        return jsonify(error={'message': 'Invalid git endpoint'}), 400

    git_path = config.get('MINEMELD_GIT_PATH', None)
    if git_path is None:
        return jsonify(error={'message': 'MINEMELD_GIT_PATH not set'}), 500

    git_args = [git_path, 'ls-remote', '-t', '-h', git_endpoint]

    git_process = Popen(
        args=git_args,
        close_fds=True,
        shell=False,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE
    )

    timeout = Timeout(20.0)
    timeout.start()
    try:
        git_stdout, git_stderr = git_process.communicate()

    except Timeout:
        git_process.kill()
        return jsonify(error={'message': 'Timeout accessing git repo'}), 400

    finally:
        timeout.cancel()

    if git_process.returncode != 0:
        LOG.error('Error running {}: {}'.format(git_args, git_stderr))
        return jsonify(error={'message': 'Error running git: {}'.format(git_stderr)}), 400

    return jsonify(result=[line.rsplit('/', 1)[-1] for line in git_stdout.splitlines()])
Example #34
def get_git_refs():
    if DISABLE_NEW_EXTENSIONS:
        return 'Disabled', 403

    git_endpoint = request.values.get('ep', None)
    if git_endpoint is None:
        return jsonify(error={'message': 'Missing endpoint'}), 400

    if not git_endpoint.endswith('.git'):
        return jsonify(error={'message': 'Invalid git endpoint'}), 400

    git_path = config.get('MINEMELD_GIT_PATH', None)
    if git_path is None:
        return jsonify(error={'message': 'MINEMELD_GIT_PATH not set'}), 500

    git_args = [git_path, 'ls-remote', '-t', '-h', git_endpoint]

    git_process = Popen(args=git_args,
                        close_fds=True,
                        shell=False,
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE)

    timeout = Timeout(20.0)
    timeout.start()
    try:
        git_stdout, git_stderr = git_process.communicate()

    except Timeout:
        git_process.kill()
        return jsonify(error={'message': 'Timeout accessing git repo'}), 400

    finally:
        timeout.cancel()

    if git_process.returncode != 0:
        LOG.error('Error running {}: {}'.format(git_args, git_stderr))
        return jsonify(
            error={'message': 'Error running git: {}'.format(git_stderr)}), 400

    return jsonify(
        result=[line.rsplit('/', 1)[-1] for line in git_stdout.splitlines()])
Example #35
 def extractor(filename):
     interpreter = POPEN(binary + [filename] + append, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     
     timeout = Timeout(getattr(settings, 'EXTRACTION_TIMEOUT', 120), ChildTimeout)
     timeout.start()
     try:
         output, run_error = interpreter.communicate('')
         timeout.cancel()
     except ChildTimeout:
         print 'killing %s' % filename
         interpreter.kill()
         raise
     
     if (output_type == 'text' and not output.strip()) or (output_type == 'html' and html_is_empty(output)) or (error and (error in output or error in run_error)):
         raise ExtractionFailed()
     elif output_type == 'html':
         # strip non-breaking spaces
         return _nbsp.sub(' ', output)
     else:
         return output
Example #36
def test():
    import filecmp
    import shutil

    # Mock urlopen
    def mocked_urlopen(url):
        usplit = urlsplit(url)
        if 'test_site' not in  usplit.hostname:
            sys.exit('Crawler tried to visit external host')

        url_path = usplit.path.lstrip('/')
        real_path = os.path.join('test_site', url_path)
        if os.path.isdir(real_path):
            real_path = os.path.join(real_path, 'index.html')
        if os.path.isfile(real_path):
            return open(real_path, 'rb')
        else:
            raise IOError('Error 404: Not Found')
    global urlopen
    urlopen = mocked_urlopen

    # Clean target directory
    if os.path.exists(TMPDIR):
        shutil.rmtree(TMPDIR)
    os.mkdir(TMPDIR)

    timeout = Timeout(15)
    try:
        timeout.start()
        Crawler('http://test_site', JOBS, TMPDIR).run()
    except Timeout:
        sys.exit('Test timeout')
    finally:
        timeout.cancel()

    # Compare dirs
    # Not very accurate measure
    if len(os.listdir('test_site/cats')) + len(os.listdir('test_site/cats2')) != len(os.listdir(TMPDIR)):
        sys.exit('Test failed. Number of cats differs'.format(file))

    print 'Test OK.'
Example #37
 def write(self, buf):
     if self.timeout is None:
         timeout = None
     else:
         timeout = Timeout(self.timeout)
         timeout.start()
     buf = compat.buffer(buf)
     byteswritten = 0
     try:
         while byteswritten != len(buf):
             nbytes = fd.write(self.fd, buf[byteswritten:])
             assert nbytes != 0
             byteswritten += nbytes
     except Timeout as e:
         if e is not timeout:
             raise
         raise TIMEOUT('Timeout writing to fd')
     else:
         if timeout is not None:
             timeout.cancel()
     return len(buf)
Example #38
    def delete(self, name):
        self.logger.debug("Preparing to execute delete()")
        timeout = Timeout(self.script_timeout)
        process = Popen(self.scriptfile_path + " -d -q" + " -n " + name,
                        stdout=PIPE,
                        shell=True)

        output = "Delete operation exceeded execution timeout.\n"
        returncode = 1
        timeout.start()
        try:
            output = process.communicate()[0]
            returncode = process.returncode
        except Timeout:
            process.kill()
            self.logger.warn(
                "Process %s servicing delete() " +
                "exceeded execution timeout and was terminated.", process.pid)
            if process.returncode is not None:
                returncode = process.returncode
        finally:
            timeout.cancel()
        return [output, returncode]
Example #39
def load_page(url):
    """Load page, recoder url which is new found, recoder url which is done
    """
    timeout = Timeout(random.uniform(20,25))
    timeout.start()
    try:
        r, status_code = http_get(url)
        if status_code == 302:
            add_url_done(url)
            return
        elif status_code != 200:
            logger.info('[url] %s [status_code] %s' % (url, status_code))
            add_url_done(url)
            return

        data = r.text  
    except Timeout, e:
        is_retry = move_doing2new(url)
        if not is_retry:
            logger.info('[url] %s [err_last_retry] %s' % (url, e))
        else:
            logger.info('[url] %s [err_retry] %s' % (url, e))
        timeout.cancel()
        return
Example #40
        def method(*args, **kwds):

            # get a connection from the pool
            client = self.get_client_from_pool()

            # no connection available in the pool
            if client is None:
                # set a timeout for acquiring a connection
                time_out = Timeout(self.get_connection_timeout)
                time_out.start()
                try:
                    async_result = AsyncResult()
                    self.no_client_queue.appendleft(async_result)
                    client = async_result.get()  # blocking
                except:
                    with self.lock:
                        if client is None:
                            self.no_client_queue.remove(async_result)
                            self.logger.error("Get Connection Timeout!")
                finally:
                    time_out.cancel()

            if client is not None:

                for i in xrange(self.max_renew_times):

                    try:
                        put_back_flag = True
                        client.last_use_time = time.time()
                        fun = getattr(client, name, None)
                        return fun(*args, **kwds)
                    except socket.timeout:
                        self.logger.error("Socket Timeout!")
                        # close the connection; leaving it open can cause out-of-order responses
                        put_back_flag = False
                        self.close_one_client(client)
                        break

                    except TTransportException, e:
                        put_back_flag = False

                        if e.type == TTransportException.END_OF_FILE:
                            self.logger.warning("Socket Connection Reset Error,%s", e)
                            with self.lock:
                                client.close()
                                self.pool_size -= 1
                                client = self.get_new_client()
                        else:
                            self.logger.error("Socket Error,%s", e)
                            self.close_one_client(client)
                            break

                    except socket.error, e:
                        put_back_flag = False
                        if e.errno == socket.errno.ECONNABORTED:
                            self.logger.warning("Socket Connection aborted Error,%s", e)
                            with self.lock:
                                client.close()
                                self.pool_size -= 1
                                client = self.get_new_client()
                        else:
                            self.logger.error("Socket Error, %s", e)
                            self.close_one_client(client)
                            break

                    except Exception as e:
                        put_back_flag = False

                        self.logger.error("Thrift Error, %s", e)
                        self.close_one_client(client)
                        break
Example #41
File: events.py Project: sahwar/attachix
    def get_Files(self, request):
        request.setResponseCode(200)
        request.setHeader('Content-Type', 'text/event-stream')
        request.setHeader('Expires', 'Fri, 01 Jan 1990 00:00:00 GMT')
        request.setHeader('Cache-Control',
                          'no-cache, no-store, max-age=0, must-revalidate')
        request.setHeader('Pragma', 'no-cache')
        request.setHeader('Access-Control-Allow-Origin', '*')

        retry = 1000  # retry time in milliseconds
        wait = 15
        if request.env.has_key('HTTP_X_SUBSCRIPTION_WAIT'):
            try:
                wait = int(request.env['HTTP_X_SUBSCRIPTION_WAIT'])
            except Exception:
                pass

        # @todo: Figure out if the connection should be closed or not
        close = True

        eventQueue = Event()
        userObject = self.getUser(request)
        if not userObject:
            return

        lastId = None
        try:
            if request.env.has_key('HTTP_LAST_EVENT_ID'):
                lastId = request.env['HTTP_LAST_EVENT_ID']
            elif request.params.has_key('Last-Event-ID'):
                lastId = request.params['Last-Event-ID']

            if lastId is not None:
                lastId = "%.4f" % lastId
        except Exception:
            pass

        channel = "%s.files" % userObject.get('id')
        events = eventQueue.subscribe(channel, lastId)
        timeout = None
        try:
            """
            @warning: If the connection is closed from haproxy for some reason
                       this connection is not recycled. That's why we close the
                       connection after 15 seconds (the haproxy timeout is set
                       to 20 seconds).
            @todo:     Figure out why the closing on the other side is not detected
                       and the connection hangs
            @todo:     Figure out how to set a timeout that sends noop command
                       every 15 seconds and keeps the connection alive for ever.
            """
            if close:
                timeout = Timeout(wait).start()
            for event in events:
                if request.env.has_key('BASE_URI'):
                    event = re.sub(
                        r'"resource":\s*"(.*?)"',
                        '"resource": "%s\\1"' % request.env['BASE_URI'], event)

                if request.env.has_key('URI_REPLACE'):
                    match = re.search(r'"resource":\s*"(.*?)"', event)
                    if match:
                        uri = match.group(1).replace(
                            request.env['URI_REPLACE'][0],
                            request.env['URI_REPLACE'][1])
                        event = re.sub(r'"resource":\s*"(.*?)"',
                                       '"resource": "%s"' % uri, event)

                text = ": %s\n" % ''.center(2049, ' ')  # 2kb padding for IE
                text += "data: {\"files\": %s}\n" % event
                match = re.search(r'"time":\s*([0-9.]+)', event)
                if match:
                    text += "id: %s\n\n" % match.group(1)
                yield text

                if not close:
                    continue

                # if we have to close then reset the timeout
                if timeout is not None:
                    timeout.clear()
                timeout = Timeout(wait).start()
                yield ":noop\n"
        except Timeout:
            yield "retry: %d" % retry
        finally:
            if hasattr(events, 'close') and callable(events.close):
                events.close()
            if timeout is not None:
                timeout.cancel()
Example #42
File: spider.py Project: Catcherman/ark
class Spider(object):
    """爬虫主类"""
    logger = logging.getLogger("spider")

    def __init__(self,
                 concurrent_num=10,
                 crawl_tags=[],
                 custom_headers={},
                 plugin=['send_url_to_celery'],
                 depth=10,
                 max_url_num=3000000,
                 internal_timeout=20,
                 spider_timeout=6 * 3600,
                 crawler_mode=1,
                 same_origin=True,
                 dynamic_parse=True):
        """
        concurrent_num    : 并行crawler和fetcher数量
        crawl_tags        : 爬行时收集URL所属标签列表
        custom_headers    : 自定义HTTP请求头
        plugin            : 自定义插件列表
        depth             : 爬行深度限制
        max_url_num       : 最大收集URL数量
        internal_timeout  : 内部调用超时时间
        spider_timeout    : 爬虫超时时间
        crawler_mode      : 爬取器模型(0:多线程模型,1:gevent模型)
        same_origin       : 是否限制相同域下
        dynamic_parse     : 是否使用WebKit动态解析
        """
        USER_AGENTS = [
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
            "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
            "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
            "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
            "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
            "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
            "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
            "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
        ]

        self.logger.setLevel(logging.DEBUG)
        hd = logging.StreamHandler()
        formatter = logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
        hd.setFormatter(formatter)
        self.logger.addHandler(hd)

        self.stopped = event.Event()
        self.internal_timeout = internal_timeout
        self.internal_timer = Timeout(internal_timeout)

        self.crawler_mode = crawler_mode  # crawler model (0: multi-threaded, 1: gevent)
        self.concurrent_num = concurrent_num
        self.fetcher_pool = pool.Pool(self.concurrent_num)
        if self.crawler_mode == 0:
            self.crawler_pool = threadpool.ThreadPool(
                min(50, self.concurrent_num))
        else:
            self.crawler_pool = pool.Pool(self.concurrent_num)

        #self.fetcher_queue = queue.JoinableQueue(maxsize=self.concurrent_num*100)
        self.fetcher_queue = threadpool.Queue(
            maxsize=self.concurrent_num * 10000)
        self.crawler_queue = threadpool.Queue(
            maxsize=self.concurrent_num * 10000)

        self.fetcher_cache = UrlCache()
        self.crawler_cache = UrlCache()

        self.default_crawl_tags = [
            'a', 'base', 'iframe', 'frame', 'object', 'frameset'
        ]
        self.ignore_ext = [
            'cab', 'ico', 'swf', 'rar', 'zip', 'tar', 'gz', 'js', '7z', 'bz2',
            'iso', 'nrg', 'uif', 'exe', 'rpm', 'deb', 'dmg', 'jar', 'jad',
            'bin', 'apk', 'run', 'msi', 'xls', 'xlsx', 'ppt', 'pptx', 'pdf',
            'doc', 'docx', 'odf', 'rtf', 'odt', 'mkv', 'avi', 'mp4', 'flv',
            'WebM', 'mov', 'wmv', '3gp', 'mpg', 'mpeg', 'mp3', 'wav', 'ss3',
            'ogg', 'mp4a', 'wma', 'png', 'jpeg', 'jpg', 'xpm', 'gif', 'tiff',
            'css', 'bmp', 'svg', 'exif', 'thmx', 'xml', 'txt'
        ]
        self.crawl_tags = list(set(self.default_crawl_tags) | set(crawl_tags))
        self.same_origin = same_origin
        self.depth = depth
        self.max_url_num = max_url_num
        self.dynamic_parse = dynamic_parse
        if self.dynamic_parse:
            self.webkit = WebKit()
        self.crawler_stopped = event.Event()

        self.plugin_handler = plugin  # plugins registered for use by the Crawler
        self.custom_headers = {'User-Agent': random.choice(USER_AGENTS)}

    def _start_fetcher(self):
        '''
        Start the fetchers
        '''
        for i in xrange(self.concurrent_num):
            fetcher = Fetcher(self)
            self.fetcher_pool.start(fetcher)

    def _start_crawler(self):
        '''
        Start the crawlers
        '''
        for _ in xrange(self.concurrent_num):
            self.crawler_pool.spawn(self.crawler)

    def start(self):
        '''
        Main entry point
        '''
        self.logger.info("spider starting...")

        if self.crawler_mode == 0:
            self.logger.info("crawler run in multi-thread mode.")
        elif self.crawler_mode == 1:
            self.logger.info("crawler run in gevent mode.")

        self._start_fetcher()
        self._start_crawler()

        self.stopped.wait()  # wait for the stop event to be set

        try:
            self.internal_timer.start()
            self.fetcher_pool.join(timeout=self.internal_timer)
            if self.crawler_mode == 1:
                self.crawler_pool.join(timeout=self.internal_timer)
            else:
                self.crawler_pool.join()
        except Timeout:
            self.logger.error("internal timeout triggered")
        finally:
            self.internal_timer.cancel()

        self.stopped.clear()
        if self.dynamic_parse:
            self.webkit.close()
        self.logger.info("crawler_cache:%s fetcher_cache:%s" % (len(
            self.crawler_cache), len(self.fetcher_cache)))
        self.logger.info("spider process quit.")

    def crawler(self, _dep=None):
        '''
        Main crawler loop
        '''
        while not self.stopped.isSet() and not self.crawler_stopped.isSet():
            try:
                self._maintain_spider()  # maintain the spider pool
                url_data = self.crawler_queue.get(block=False)
            except queue.Empty, e:
                if self.crawler_queue.unfinished_tasks == 0 and self.fetcher_queue.unfinished_tasks == 0:
                    self.stop()
                else:
                    if self.crawler_mode == 1:
                        gevent.sleep()
            else:
                pre_depth = url_data.depth
                curr_depth = pre_depth + 1
                link_generator = HtmlAnalyzer.extract_links(
                    url_data.html, url_data.url, self.crawl_tags)
                link_list = [url for url in link_generator]
                if self.dynamic_parse:
                    link_generator = self.webkit.extract_links(url_data.url)
                    link_list.extend([url for url in link_generator])
                link_list = list(set(link_list))
                for index, link in enumerate(link_list):
                    if not self.check_url_usable(link):
                        continue
                    # URL similarity check, see urlfilter.py
                    if not self.check_url_similar(link):
                        continue
                    # URL duplicate check, see urlfilter.py
                    if not self.check_url_repeat(link):
                        continue
                    if curr_depth > self.depth:  # maximum crawl depth check
                        if self.crawler_stopped.isSet():
                            break
                        else:
                            self.crawler_stopped.set()
                            break

                    if len(self.fetcher_cache) == self.max_url_num:  # maximum collected URL count check
                        if self.crawler_stopped.isSet():
                            break
                        else:
                            self.crawler_stopped.set()
                            break
                    link = to_unicode(link)
                    url = UrlData(link, depth=curr_depth)
                    self.fetcher_cache.insert(url)
                    self.fetcher_queue.put(url, block=True)

                for plugin_name in self.plugin_handler:  # dynamically invoke each plugin registered at init
                    try:
                        plugin_obj = eval(plugin_name)()
                        plugin_obj.start(url_data)
                    except Exception, e:
                        import traceback
                        traceback.print_exc()

                self.crawler_queue.task_done()
Example #43
    def run(self):
        """
        While determined to be running, run loop is:

        - check if total time has been violated
            - if max time has been violated, log and put in finished state
        - wait X seconds for next state
        - yield

        :return:
        """
        # apply the first state so we can follow event loop flow
        self.event_result_q.put_nowait(self.machine.events.first_state())

        logger.debug(
            '%s', {
                'message': 'sending_first_state',
                'first_state': self.machine.events.first_state()
            })

        timeout = Timeout(self.machine.max_timeout)
        timeout.start()
        try:

            while self.machine.is_running():
                # how do we support a general overarching timeout
                # and a specific one for the current running event
                try:
                    # we can ignore the next state, this is only used to indicate
                    # when it's time to apply a transition
                    result = self.event_result_q.get()

                except gevent.queue.Empty:
                    logger.debug('%s', {
                        'message': 'queue_empty',
                    })

                else:
                    if result == EVENT_RESULT.FAILURE:
                        logger.debug('%s', {'message': 'task_failure'})
                        return False

                    logger.debug('%s', {
                        'message': 'state_change_requested',
                    })
                    self.machine.events.teardown_current()
                    self.machine.next_state()

                    if self.machine.state == STATES.FINISHED:
                        logger.debug(
                            '%s', {
                                'message': 'task_execution_finished',
                                'status': 'SUCCESS',
                            })
                        return True

                    self.machine.run_current_event(
                        event_result_q=self.event_result_q)

        except Timeout:
            logger.error(
                '%s', {
                    'message': 'task timeout reached',
                    'timeout': self.machine.max_timeout,
                    'units': 'seconds'
                })
            return False

        finally:
            timeout.cancel()

        return True
Example #44
#!/usr/bin/env python
# encoding: utf-8

import gevent
from gevent import Timeout

seconds = 10

timeout = Timeout(seconds)
timeout.start()


def wait():
    gevent.sleep(10)


try:
    gevent.spawn(wait).join()
except Timeout:
    print('Could not complete')
finally:
    timeout.cancel()
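The same guard can be written without managing the Timeout object by hand, because Greenlet.get() accepts a timeout and raises gevent.Timeout itself when it expires. A minimal sketch using the wait() function and seconds value from the example above:

    try:
        gevent.spawn(wait).get(timeout=seconds)
    except Timeout:
        print('Could not complete')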
Example #45
#!/usr/bin/env python
# encoding: utf-8

import gevent
from gevent import Timeout

seconds = 10

timeout = Timeout(seconds)
timeout.start()

def wait():
    gevent.sleep(10)

try:
    gevent.spawn(wait).join()
except Timeout:
    print('Could not complete')
finally:
    timeout.cancel()

Example #46
class Spider(object):
    """爬虫主类"""
    logger = logging.getLogger("spider")

    def __init__(self,
                 concurrent_num=20,
                 crawl_tags=[],
                 depth=3,
                 max_url_num=300,
                 internal_timeout=60,
                 spider_timeout=6 * 3600,
                 crawler_mode=0,
                 same_origin=True,
                 dynamic_parse=False):
        """
        concurrent_num    : 并行crawler和fetcher数量
        crawl_tags        : 爬行时收集URL所属标签列表
        depth             : 爬行深度限制
        max_url_num       : 最大收集URL数量
        internal_timeout  : 内部调用超时时间
        spider_timeout    : 爬虫超时时间
        crawler_mode      : 爬取器模型(0:多线程模型,1:gevent模型)
        same_origin       : 是否限制相同域下
        dynamic_parse     : 是否使用WebKit动态解析
        """

        self.logger.setLevel(logging.DEBUG)
        hd = logging.StreamHandler()
        formatter = logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
        hd.setFormatter(formatter)
        self.logger.addHandler(hd)

        self.stopped = event.Event()
        self.internal_timer = Timeout(internal_timeout)

        self.crawler_mode = crawler_mode  # crawler model (0: multi-threaded, 1: gevent)
        self.concurrent_num = concurrent_num
        self.fetcher_pool = pool.Pool(self.concurrent_num)
        if self.crawler_mode == 0:
            self.crawler_pool = threadpool.ThreadPool(
                min(50, self.concurrent_num))
        else:
            self.crawler_pool = pool.Pool(self.concurrent_num)

        #self.fetcher_queue = queue.JoinableQueue(maxsize=self.concurrent_num*100)
        self.fetcher_queue = threadpool.Queue(maxsize=self.concurrent_num *
                                              100)
        self.crawler_queue = threadpool.Queue(maxsize=self.concurrent_num *
                                              100)

        self.fetcher_cache = UrlCache()
        self.crawler_cache = UrlCache()

        self.default_crawl_tags = ['a', 'base', 'iframe', 'frame', 'object']
        self.ignore_ext = [
            'js', 'css', 'png', 'jpg', 'gif', 'bmp', 'svg', 'exif', 'jpeg',
            'exe', 'rar', 'zip'
        ]
        self.crawl_tags = list(set(self.default_crawl_tags) | set(crawl_tags))
        self.same_origin = same_origin
        self.depth = depth
        self.max_url_num = max_url_num
        self.dynamic_parse = dynamic_parse
        if self.dynamic_parse:
            self.webkit = WebKit()
        self.crawler_stopped = event.Event()

    def _start_fetcher(self):
        '''
        Start the fetchers
        '''
        for i in xrange(self.concurrent_num):
            fetcher = Fetcher(self)
            self.fetcher_pool.start(fetcher)

    def _start_crawler(self):
        '''
        Start the crawlers
        '''
        for _ in xrange(self.concurrent_num):
            self.crawler_pool.spawn(self.crawler)

    def start(self):
        '''
        Main entry point
        '''
        self.logger.info("spider starting...")

        if self.crawler_mode == 0:
            self.logger.info("crawler run in multi-thread mode.")
        elif self.crawler_mode == 1:
            self.logger.info("crawler run in gevent mode.")

        self._start_fetcher()
        self._start_crawler()

        self.stopped.wait()  # wait for the stop event to be set

        try:
            self.internal_timer.start()
            self.fetcher_pool.join(timeout=self.internal_timer)
            if self.crawler_mode == 1:
                self.crawler_pool.join(timeout=self.internal_timer)
            else:
                self.crawler_pool.join()
        except Timeout:
            self.logger.error("internal timeout triggered")
        finally:
            self.internal_timer.cancel()

        self.stopped.clear()
        if self.dynamic_parse:
            self.webkit.close()
        self.logger.info("crawler_cache:%s fetcher_cache:%s" %
                         (len(self.crawler_cache), len(self.fetcher_cache)))
        self.logger.info("spider process quit.")

    def crawler(self, _dep=None):
        '''
        Main crawler loop
        '''
        while not self.stopped.isSet() and not self.crawler_stopped.isSet():
            try:
                self._maintain_spider()  # maintain the spider pool
                url_data = self.crawler_queue.get(block=False)
            except queue.Empty, e:
                if self.crawler_queue.unfinished_tasks == 0 and self.fetcher_queue.unfinished_tasks == 0:
                    self.stop()
                else:
                    if self.crawler_mode == 1:
                        gevent.sleep()
            else:
                pre_depth = url_data.depth
                curr_depth = pre_depth + 1
                link_generator = HtmlAnalyzer.extract_links(
                    url_data.html, url_data.url, self.crawl_tags)
                link_list = [url for url in link_generator]
                if self.dynamic_parse:
                    link_generator = self.webkit.extract_links(url_data.url)
                    link_list.extend([url for url in link_generator])
                link_list = list(set(link_list))
                for index, link in enumerate(link_list):
                    if not self.check_url_usable(link):
                        continue
                    if curr_depth > self.depth:  # enforce the maximum crawl depth
                        if self.crawler_stopped.isSet():
                            break
                        else:
                            self.crawler_stopped.set()
                            break

                    if len(self.fetcher_cache) == self.max_url_num:  # enforce the maximum number of collected URLs
                        if self.crawler_stopped.isSet():
                            break
                        else:
                            self.crawler_stopped.set()
                            break
                    link = to_unicode(link)
                    url = UrlData(link, depth=curr_depth)
                    self.fetcher_cache.insert(url)
                    self.fetcher_queue.put(url, block=True)
                self.crawler_queue.task_done()
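The start() method above arms a single internal Timeout and relies on it to bound the pool joins, so a hung worker cannot block shutdown forever. A minimal standalone sketch of that guard, assuming gevent's Timeout and Pool (the sleeping worker and the 2-second budget are illustrative):

import gevent
from gevent import Timeout
from gevent.pool import Pool


def join_with_deadline(workers, seconds):
    # Arm a Timeout in the current greenlet; if join() has not returned
    # after `seconds`, the Timeout is raised here and we stop waiting.
    timer = Timeout(seconds)
    timer.start()
    try:
        workers.join()
        return True
    except Timeout:
        return False
    finally:
        timer.cancel()


workers = Pool(5)
workers.spawn(gevent.sleep, 10)            # illustrative long-running task
print(join_with_deadline(workers, 2))      # False: the deadline fires first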
Example #47
0
File: spider.py Project: zhupite233/scaner
class Spider(object):
    """爬虫主类"""
    logger = logging.getLogger("spider")

    def __init__(self,
                 concurrent_num=20,
                 crawl_tags=[],
                 custom_headers={},
                 plugin=[],
                 depth=10,
                 max_url_num=3000,
                 internal_timeout=60,
                 spider_timeout=1800,
                 dir_max_url=15,
                 crawler_mode=0,
                 same_origin=True,
                 dynamic_parse=False,
                 login_dict={},
                 scan_task_id=0):
        """
        concurrent_num    : 并行crawler和fetcher数量
        crawl_tags        : 爬行时收集URL所属标签列表
        custom_headers    : 自定义HTTP请求头
        plugin            : 自定义插件列表
        depth             : 爬行深度限制
        max_url_num       : 最大收集URL数量
        internal_timeout  : 内部调用超时时间
        spider_timeout    : 爬虫超时时间
        crawler_mode      : 爬取器模型(0:多线程模型,1:gevent模型)
        same_origin       : 是否限制相同域下
        dynamic_parse     : 是否使用WebKit动态解析
        """

        self.logger.setLevel(logging.DEBUG)
        hd = logging.StreamHandler()
        formatter = logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
        hd.setFormatter(formatter)
        self.logger.addHandler(hd)

        self.stopped = event.Event()
        self.internal_timeout = internal_timeout
        self.internal_timer = Timeout(internal_timeout)
        self.spider_stop_time = time() + spider_timeout
        self.crawler_mode = crawler_mode  # crawler model
        self.concurrent_num = concurrent_num
        self.fetcher_pool = pool.Pool(self.concurrent_num)
        if self.crawler_mode == 0:
            self.crawler_pool = threadpool.ThreadPool(
                min(50, self.concurrent_num))
        else:
            self.crawler_pool = pool.Pool(self.concurrent_num)
        # self.fetcher_queue = queue.JoinableQueue(maxsize=self.concurrent_num*100)
        self.fetcher_queue = threadpool.Queue(maxsize=self.concurrent_num *
                                              10000)
        self.crawler_queue = threadpool.Queue(maxsize=self.concurrent_num *
                                              10000)

        self.fetcher_cache = UrlCache()
        self.crawler_cache = UrlCache()

        self.default_crawl_tags = [
            'script', 'a', 'base', 'iframe', 'frame', 'object'
        ]
        self.ignore_ext = [
            'js', 'css', 'png', 'jpg', 'gif', 'bmp', 'svg', 'exif', 'jpeg',
            'exe', 'rar', 'zip', 'swf', 'ico'
        ]
        self.crawl_tags = list(set(self.default_crawl_tags) | set(crawl_tags))
        self.same_origin = same_origin
        self.depth = depth
        self.max_url_num = max_url_num
        self.dir_max_url = dir_max_url
        self.dynamic_parse = dynamic_parse
        if self.dynamic_parse:
            self.webkit = WebKit(login_dict)
            if login_dict:
                self.webkit.auto_login()
            # elif custom_headers.get('Cookie'):
            #
            #     self.webkit.set_cookie(custom_headers)

        self.crawler_stopped = event.Event()

        self.plugin_handler = plugin  # register plugins used by the Crawler
        self.custom_headers = custom_headers
        self.scan_task_id = scan_task_id

    def _start_fetcher(self):
        '''
        Start the fetcher workers.
        '''
        for i in xrange(self.concurrent_num):
            fetcher = Fetcher(self)
            self.fetcher_pool.start(fetcher)

    def _start_crawler(self):
        '''
        Start the crawler workers.
        '''
        for _ in xrange(self.concurrent_num):
            crawler = Crawler(self)
            self.crawler_pool.spawn(crawler._run)  # pass the bound method; spawn() calls it in a new greenlet

    def start(self):
        '''
        Main entry point that starts the spider.
        '''
        self.logger.info("spider starting...")

        if self.crawler_mode == 0:
            self.logger.info("crawler run in multi-thread mode.")
        elif self.crawler_mode == 1:
            self.logger.info("crawler run in gevent mode.")

        self._start_fetcher()
        self._start_crawler()
        self.stopped.wait()  # wait for the stop event to be set

        try:
            self.internal_timer.start()
            self.fetcher_pool.join(timeout=self.internal_timer)
            if self.crawler_mode == 1:
                self.crawler_pool.join(timeout=self.internal_timer)
            else:
                self.crawler_pool.join()
        except Timeout:
            self.logger.error("internal timeout triggered")
        finally:
            self.internal_timer.cancel()

        self.stopped.clear()
        if self.dynamic_parse:
            self.webkit.close()
        self.logger.info("crawler_cache:%s fetcher_cache:%s" %
                         (len(self.crawler_cache), len(self.fetcher_cache)))
        self.logger.info("spider process quit.")

    def feed_url(self, url):
        '''
        Seed the spider with an initial URL.
        '''
        if isinstance(url, basestring):
            url = to_unicode(url)
            url = UrlData(url)

        if self.same_origin:
            url_part = urlparse.urlparse(unicode(url))
            self.origin = (url_part.scheme, url_part.netloc)

        self.fetcher_queue.put(url, block=True)

    def stop(self):
        '''
        Stop the spider.
        '''
        self.stopped.set()
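A hedged usage sketch for a Spider like the one above: the constructor arguments, feed_url() and start() follow the signatures shown in this example, while the seed URL and the chosen values are purely illustrative.

spider = Spider(concurrent_num=10,
                depth=3,
                max_url_num=500,
                crawler_mode=1,          # gevent model
                dynamic_parse=False)
spider.feed_url("http://example.com/")   # illustrative seed URL
spider.start()                           # blocks until the stop event is set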
Example #48
0
File: events.py Project: slaff/attachix
    def get_Files(self, request):
        request.setResponseCode(200)
        request.setHeader('Content-Type', 'text/event-stream')
        request.setHeader('Expires', 'Fri, 01 Jan 1990 00:00:00 GMT')
        request.setHeader('Cache-Control', 'no-cache, no-store, max-age=0, must-revalidate')
        request.setHeader('Pragma','no-cache')
        request.setHeader('Access-Control-Allow-Origin', '*')
        
        retry = 1000 # retry time in milliseconds
        wait = 15
        if request.env.has_key('HTTP_X_SUBSCRIPTION_WAIT'):
            try:
                wait = int(request.env['HTTP_X_SUBSCRIPTION_WAIT'])
            except Exception:
                pass

        # @todo: Figure out if the connection should be closed or not
        close = True

        eventQueue = Event()
        userObject = self.getUser(request)
        if not userObject:
            return

        lastId = None
        try:
            if request.env.has_key('HTTP_LAST_EVENT_ID'):
                lastId =  request.env['HTTP_LAST_EVENT_ID']
            elif request.params.has_key('Last-Event-ID'):
                lastId = request.params['Last-Event-ID']

            if lastId is not None:
                lastId = "%.4f" % lastId
        except Exception:
                pass

        channel = "%s.files" % userObject.get('id')
        events = eventQueue.subscribe(channel, lastId)
        timeout = None
        try:
            """
            @warning:  If the connection is closed from haproxy for some reason
                       this connection is not recycled. That's why we close the
                       connection after 15 seconds (the haproxy timeout is set
                       to 20 seconds).
            @todo:     Figure out why the closing on the other side is not detected
                       and the connection hangs
            @todo:     Figure out how to set a timeout that sends noop command
                       every 15 seconds and keeps the connection alive for ever.
            """
            if close:
                timeout = Timeout(wait)
                timeout.start()
            for event in events:
                if request.env.has_key('BASE_URI'):
                    event = re.sub(r'"resource":\s*"(.*?)"', '"resource": "%s\\1"' % request.env['BASE_URI'], event)

                if request.env.has_key('URI_REPLACE'):
                    match = re.search(r'"resource":\s*"(.*?)"', event)
                    if match:
                        uri = match.group(1).replace(request.env['URI_REPLACE'][0],request.env['URI_REPLACE'][1])
                        event = re.sub(r'"resource":\s*"(.*?)"', '"resource": "%s"' % uri, event)

                text  = ": %s\n" % ''.center(2049,' ') # 2kb padding for IE
                text += "data: {\"files\": %s}\n" % event
                match = re.search(r'"time":\s*([0-9.]+)',event)
                if match:
                    text += "id: %s\n\n" % match.group(1)
                yield text

                if not close:
                    continue

                # if we have to close then reset the timeout
                if timeout is not None:
                    timeout.cancel()
                timeout = Timeout(wait)
                timeout.start()
                yield ":noop\n"
        except Timeout:
            yield "retry: %d" % retry
        finally:
            if hasattr(events, 'close') and callable(events.close):
                events.close()
            if timeout is not None:
                timeout.cancel()
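The handler above wraps a blocking event iterator in a Timeout so the connection can be recycled after `wait` idle seconds, per the @warning note. A stripped-down sketch of the same idea, assuming gevent and a blocking gevent queue as the event source (function and variable names are illustrative):

from gevent import Timeout
from gevent.queue import Queue


def sse_stream(events, wait=15, retry=1000):
    # Yield Server-Sent Events frames; if no event arrives within `wait`
    # seconds the Timeout fires, we tell the client when to reconnect,
    # and the generator ends so the connection can be recycled.
    try:
        while True:
            with Timeout(wait):
                event = events.get()      # blocks until an event arrives
            yield "data: %s\n\n" % event
    except Timeout:
        yield "retry: %d\n\n" % retry


events = Queue()
events.put('{"time": 1.0}')               # illustrative event payload
stream = sse_stream(events, wait=1)
print(next(stream))                       # the queued event
print(next(stream))                       # retry hint after the idle timeout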
Example #49
0
    def send(self, data, max_bytes=None, retry_wait=0.25):
        """ Socket Wrapper for people using this class as if it were just
            a python socket class; always ignore EINTR flags to finish
            sending the data
        """

        # track bytes written
        tot_bytes = 0

        # Get reference time
        cur_time = datetime.now()

        # Current elapsed time
        elapsed_time = 0.0

        if not max_bytes:
            max_bytes = len(data)

        while self.connected and (max_bytes - tot_bytes) > 0:

            # This timer is used to determine how long we should
            # wait before assuming we can't send content and that we
            # have connection problems.

            stale_timeout = max(((max_bytes - tot_bytes) / 10800.0), 15.0)

            if not self.can_write(stale_timeout):
                # can't write down pipe; something has gone wrong
                self.close()
                raise SocketException('Connection write wait timeout')

            # bump 10 seconds onto our timeout
            stale_timeout += 10

            # Initialize our stale timeout timer
            stale_timer = Timeout(stale_timeout)

            # Update Elapsed Time
            elapsed_time = datetime.now() - cur_time
            elapsed_time = (elapsed_time.days * 86400) \
                + elapsed_time.seconds + (elapsed_time.microseconds / 1e6)

            try:
                # Start stale_timer
                stale_timer.start()

                # Send data
                bytes_sent = self.socket.send(
                    data[tot_bytes:max_bytes],
                )
                stale_timer.cancel()

                # Get our elapsed transfer time
                elapsed_xfer_time = datetime.now() - cur_time
                elapsed_xfer_time = (
                    (elapsed_xfer_time.days * 86400) +
                    elapsed_xfer_time.seconds +
                    (elapsed_xfer_time.microseconds / 1e6)) - elapsed_time

                if not bytes_sent:
                    self.close()
                    raise SocketException('Connection lost')

                # Handle content received
                tot_bytes += bytes_sent

                # Call write hook
                self.hooks.call(
                    'socket_write',
                    # Host information
                    host=self._remote_addr,
                    port=self._remote_port,
                    # The number of bytes read
                    xfer_bytes=bytes_sent,
                    # The time it took to read these bytes
                    xfer_time=elapsed_xfer_time,
                    # Our sockets
                    socket=weakref.proxy(self),
                )

            except Timeout:
                # Timeout occurred; Sleep for a little bit
                sleep(retry_wait)
                if self.can_write() is None:
                    self.close()
                    raise SocketException('Connection broken due to timeout')

            except socket.error, e:
                if e[0] == errno.EAGAIN:
                    # Timeout occurred; Sleep for a little bit
                    sleep(retry_wait)
                    if self.can_write() is None:
                        self.close()
                        raise SocketException('Connection broken')

                elif e[0] == errno.EINTR:
                    # A Signal was caught, resend this data before
                    # raising the signal higher.  signals can wait when
                    # there is data flowing
                    raise SignalCaughtException('Signal received')
                else:
                    # errno.EPIPE (Broken Pipe) usually at this point
                    self.close()
                    raise SocketException('Connection lost')
            except Exception, e:
                self.close()
                raise
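The send() loop above brackets each socket.send() with a stale timer so a stalled peer cannot hang the sender indefinitely. A minimal sketch of that single-send guard, assuming a gevent-cooperative (monkey-patched) socket; the helper name and the plain socket.error replacement for SocketException are illustrative:

import socket

from gevent import Timeout


def send_with_deadline(sock, data, seconds):
    # Abort one send() call that has made no progress after `seconds`.
    # The socket must be cooperative (e.g. gevent.monkey.patch_socket()),
    # otherwise the blocking send cannot be interrupted by the Timeout.
    timer = Timeout(seconds)
    timer.start()
    try:
        return sock.send(data)
    except Timeout:
        sock.close()
        raise socket.error('send stalled for %ss' % seconds)
    finally:
        timer.cancel()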
Example #50
0
File: vulcan.py Project: Flygend/vulcan
class Spider(object):
    """爬虫主类"""
    logger = logging.getLogger("spider")

    #   equivalent to a constructor
    def __init__(self,
                 concurrent_num=20,
                 crawl_tags=[],
                 custom_headers={},
                 plugin=[],
                 depth=3,
                 max_url_num=300,
                 internal_timeout=60,
                 spider_timeout=6 * 3600,
                 crawler_mode=0,
                 same_origin=True,
                 dynamic_parse=False):
        """
        concurrent_num    : 并行crawler和fetcher数量
        crawl_tags        : 爬行时收集URL所属标签列表
        custom_headers    : 自定义HTTP请求头
        plugin            : 自定义插件列表
        depth             : 爬行深度限制
        max_url_num       : 最大收集URL数量
        internal_timeout  : 内部调用超时时间
        spider_timeout    : 爬虫超时时间
        crawler_mode      : 爬取器模型(0:多线程模型,1:gevent模型)
        same_origin       : 是否限制相同域下
        dynamic_parse     : 是否使用WebKit动态解析
        """

        #   logging setup
        self.logger.setLevel(logging.DEBUG)  #   log level
        formatter = logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s")  #   log format
        hd = logging.StreamHandler()
        hd.setFormatter(formatter)
        self.logger.addHandler(hd)

        self.stopped = event.Event()
        self.internal_timeout = internal_timeout  #   timeout for internal calls
        self.internal_timer = Timeout(internal_timeout)

        self.crawler_mode = crawler_mode  #   crawler model
        self.concurrent_num = concurrent_num  #   number of parallel crawlers and fetchers

        #   fetchers always use the gevent model
        self.fetcher_pool = pool.Pool(self.concurrent_num)

        #   crawler model setup
        #   the crawler extracts URLs from HTML and feeds them to the fetcher;
        #   the fetcher downloads HTML and feeds it back to the crawler
        if self.crawler_mode == 0:
            #   thread-pool model
            self.crawler_pool = threadpool.ThreadPool(
                min(50, self.concurrent_num))
        else:
            #   gevent model
            self.crawler_pool = pool.Pool(self.concurrent_num)

        #   fetcher and crawler work independently and are linked through queues
        # self.fetcher_queue = queue.JoinableQueue(maxsize=self.concurrent_num*100)
        self.fetcher_queue = threadpool.Queue(maxsize=self.concurrent_num *
                                              10000)
        self.crawler_queue = threadpool.Queue(maxsize=self.concurrent_num *
                                              10000)

        self.fetcher_cache = UrlCache()
        self.crawler_cache = UrlCache()

        self.default_crawl_tags = ['a', 'base', 'iframe', 'frame',
                                   'object']  #   default tags whose URLs are collected
        self.ignore_ext = [
            'js', 'css', 'png', 'jpg', 'gif', 'bmp', 'svg', 'exif', 'jpeg',
            'exe', 'rar', 'zip'
        ]  #   extensions ignored while crawling
        self.crawl_tags = list(set(self.default_crawl_tags)
                               | set(crawl_tags))  #   tags whose URLs are collected while crawling
        self.same_origin = same_origin  #   restrict to the same origin
        self.depth = depth  #   crawl depth limit
        self.max_url_num = max_url_num  #   maximum number of URLs to collect
        self.dynamic_parse = dynamic_parse  #   use WebKit for dynamic parsing

        #   if dynamic parsing is enabled
        if self.dynamic_parse:
            self.webkit = WebKit()
        self.crawler_stopped = event.Event()

        self.plugin_handler = plugin  # register plugins used by the Crawler
        #   custom HTTP headers
        self.custom_headers = custom_headers

    def _start_fetcher(self):
        '''
        Start the fetcher workers.
        '''
        for i in xrange(self.concurrent_num):  #   concurrent_num: degree of parallelism
            fetcher = Fetcher(self)  #   instantiate a fetcher
            self.fetcher_pool.start(fetcher)  #   start() kicks off _run()

    def _start_crawler(self):
        '''
        Start the crawler workers.
        '''
        for _ in xrange(self.concurrent_num):
            self.crawler_pool.spawn(self.crawler)  #   run self.crawler() in the pool

    def start(self):
        '''
        Main entry point that starts the spider.
        '''
        self.logger.info("spider starting...")

        if self.crawler_mode == 0:
            self.logger.info("crawler run in multi-thread mode.")
        elif self.crawler_mode == 1:
            self.logger.info("crawler run in gevent mode.")

        #   start fetchers and crawlers
        self._start_fetcher()  #   the seed URL was already set via spider.feed_url(url) in main
        self._start_crawler()

        self.stopped.wait()  # wait for the stop event to be set

        #   wait for fetchers and crawlers to finish
        try:
            self.internal_timer.start()
            self.fetcher_pool.join(timeout=self.internal_timer)
            if self.crawler_mode == 1:
                self.crawler_pool.join(timeout=self.internal_timer)
            else:
                self.crawler_pool.join()
        except Timeout:
            self.logger.error("internal timeout triggered")
        finally:
            self.internal_timer.cancel()

        self.stopped.clear()
        if self.dynamic_parse:
            self.webkit.close()
        self.logger.info("crawler_cache:%s fetcher_cache:%s" %
                         (len(self.crawler_cache), len(self.fetcher_cache)))
        self.logger.info("spider process quit.")

    def crawler(self, _dep=None):
        '''
        Main crawler loop.
        '''
        while not self.stopped.isSet() and not self.crawler_stopped.isSet():
            try:
                self._maintain_spider()  # maintain the worker pools
                url_data = self.crawler_queue.get(
                    block=False)  #   take a URL from the crawl queue
            except queue.Empty, e:  #   queue is empty
                if self.crawler_queue.unfinished_tasks == 0 and self.fetcher_queue.unfinished_tasks == 0:  #   everything has been processed
                    self.stop()
                else:  #   fetchers still have work pending
                    if self.crawler_mode == 1:
                        gevent.sleep()
            else:
                pre_depth = url_data.depth
                curr_depth = pre_depth + 1  #   current depth

                #   build the list of URLs
                link_generator = HtmlAnalyzer.extract_links(
                    url_data.html, url_data.url, self.crawl_tags)
                link_list = [url for url in link_generator]
                if self.dynamic_parse:  #   WebKit dynamic parsing
                    link_generator = self.webkit.extract_links(url_data.url)
                    link_list.extend([url for url in link_generator])
                link_list = list(set(link_list))  #   de-duplicate

                #   iterate over the extracted URLs
                for index, link in enumerate(link_list):
                    if not self.check_url_usable(link):
                        continue
                    if curr_depth > self.depth:  # enforce the maximum crawl depth
                        if self.crawler_stopped.isSet():
                            break
                        else:
                            self.crawler_stopped.set()
                            break

                    if len(self.fetcher_cache) == self.max_url_num:  # enforce the maximum number of collected URLs
                        if self.crawler_stopped.isSet():
                            break
                        else:
                            self.crawler_stopped.set()
                            break
                    link = to_unicode(link)
                    url = UrlData(link, depth=curr_depth)
                    #   order adjusted here: enqueue for download first
                    self.fetcher_queue.put(url, block=True)
                    self.fetcher_cache.insert(url)  #   record the URL in the fetcher cache

                #   plugin section (not the focus here)
                for plugin_name in self.plugin_handler:  # dynamically invoke the plugins registered at init
                    try:
                        plugin_obj = eval(plugin_name)()
                        plugin_obj.start(url_data)
                    except Exception, e:
                        import traceback
                        traceback.print_exc()

                self.crawler_queue.task_done()
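All of the examples above arm the timer manually with start() and cancel() in a finally block. gevent's Timeout can also be used as a context manager, which cancels itself on exit; a minimal sketch of the same internal-timer guard written that way (the join target and the helper name are illustrative):

from gevent import Timeout


def join_with_timeout(workers, seconds):
    # The with-block cancels the timer automatically on exit; an expired
    # timer raises Timeout, which we turn into a False return value.
    try:
        with Timeout(seconds):
            workers.join()
        return True
    except Timeout:
        return False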