Пример #1
0
    def __init__(self):
        """Create the HTTP connection pool for the configured Elasticsearch URL.

        Reads ``elasticsearch.url`` and ``elasticsearch.pool.maxsize`` from the
        application configuration, extracts the host part of the URL (scheme
        and path removed) and opens a ``urllib3.HTTPConnectionPool`` on it.
        """
        esurl = Configure.configure().value("elasticsearch.url")
        poolsize = Configure.configure().value("elasticsearch.pool.maxsize")

        self._es_url = esurl
        self._pool_maxsize = poolsize

        # Strip the scheme. A URL without a recognised scheme is used as-is
        # rather than leaving the domain as None and failing further down.
        domain = esurl
        for scheme in ("http://", "https://"):
            if esurl.startswith(scheme):
                domain = esurl[len(scheme):]
                break

        # Keep everything before the first "/". partition() returns the whole
        # string when there is no slash; the previous find()/slice pair got -1
        # in that case and silently cut off the last character of the host.
        self._es_domain = domain.partition("/")[0]

        print("extract domain", self._es_domain)
        self._es_client = urllib3.HTTPConnectionPool(
            self._es_domain, maxsize=self._pool_maxsize)
Пример #2
0
    def __init__(self, p_request_queue, p_alive_timeout=None):
        """Set up one headless Chrome driver together with its message queues.

        p_request_queue -- queue object stored on the instance as-is.
        p_alive_timeout -- stored on the instance as-is; defaults to None.

        The window size and the driver executable path are read from the
        ``headless.webdriver.*`` configuration keys.
        """
        threading.Thread.__init__(self)

        # Browser settings from the application configuration.
        height_key = "headless.webdriver.iniBrowserWinHeight"
        width_key = "headless.webdriver.iniBrowserWinWidth"
        path_key = "headless.webdriver.path"
        self._iniWinHeight = Configure.configure().value(height_key)
        self._iniWinWidth = Configure.configure().value(width_key)
        self._driver_path = Configure.configure().value(path_key)

        # One-slot queues: messages in, results out.
        # NOTE(review): the output queue is an asyncio.Queue while this object
        # is a thread -- confirm how it is fed from the worker side.
        self._input = queue.Queue(1)
        self._output = asyncio.Queue(maxsize=1)

        self._alive_timeout = p_alive_timeout
        self._request_queue = p_request_queue

        # Event name -> bound handler.
        self._events = {"getPage": self.getPage, "snapshot": self.getSnapShot}

        window_size = str(self._iniWinWidth) + "x" + str(self._iniWinHeight)
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--window-size=" + window_size)

        browser = webdriver.Chrome(chrome_options=chrome_options,
                                   executable_path=self._driver_path)

        # Bookkeeping record kept next to the driver instance.
        self._driverwrapper = {
            "driver": browser,
            "instancetime": datetime.datetime.now(),
            "lastactivetime": datetime.datetime.now(),
            "usetimes": 0
        }
Пример #3
0
    def execute(self, p_job, p_exectime):
        """Work out and print the working directories for one job execution.

        p_job      -- mapping with a "scenario" entry that holds "scenarioId".
        p_exectime -- execution timestamp, concatenated into the directory
                      name (so it has to be a string, like the scenario id).
        """
        sid = p_job["scenario"]["scenarioId"]

        # Read but not used in the visible part of this method.
        outputdir = Configure.configure().value("application.outputdir")
        rootdir = Configure.configure().value("application.rootdir")

        # <rootdir>/s_<scenario id>_<exec time>[/tmp]
        workdir = rootdir + "/" + ("s" + "_" + sid + "_" + p_exectime)
        tmpdir = workdir + "/tmp"
        print("task dir: %s, %s" % (workdir, tmpdir))
Пример #4
0
  def fuckup(p_command=None):
    """Start the scheduler application and keep the main thread alive.

    Steps, in order: load ``application-config.yml`` from the current working
    directory, create the logger, initialise the Elasticsearch handler, start
    the crawler register and the parallel schedule on one shared job queue,
    join the crawler register, then sleep in a loop until interrupted.

    p_command -- passed through unchanged to ``Configure.load``.
    """
    Main.rootdir = os.path.abspath('.')
    # Kept from the original; the handle itself is not used below.
    manager = Manager()

    # Application configuration
    config_path = "/".join((Main.rootdir, "application-config.yml"))
    Configure.load(p_dir=config_path, p_command=p_command)

    # Logging
    Logger()

    # Elasticsearch client
    ESHandler.ini()

    # Job queue shared by the crawler register and the scheduler
    queue_size = Configure.configure().value("scheduler.messageQueueSize", p_default=1000)
    main_jod_queue = ThreadSafeQueue(size=queue_size)

    crawler_picker = CrawlerPicker()
    register = CrawlerRegister(p_crawler_picker=crawler_picker, p_main_jod_queue=main_jod_queue)
    Main.crawlerRegister = register
    register.start()

    schedule = ParellelSchedule(p_main_jod_queue=main_jod_queue)
    Main.parellelSchedule = schedule
    schedule.start()

    # Wait for the crawler register before entering the idle loop.
    register.join()

    try:
        # Idle loop that keeps the main thread alive.
        while True:
            time.sleep(2)
    except (KeyboardInterrupt, SystemExit):
        pass
Пример #5
0
Файл: hub.py Проект: xlybaby/VAR
 def start(self):
     """Accept TCP connections forever and hand each one to the request queue.

     Binds a listening socket on ``(self._host, self._port)`` with the
     configured send/receive buffer sizes. Every accepted connection is
     wrapped in a dict (``conn``, ``addr``, ``delimiter``) and put on
     ``self._queue``, waiting at most the request-waiting timeout cached by
     ``__init__``.

     A connection that cannot be queued in time is closed and the loop goes
     on; previously the ``queue.Full`` error ended the accept loop and left
     both sockets open. The listening socket is closed when the loop is left
     for any reason.
     """
     import queue  # local import: only needed for the Full handler below

     # Read once at construction time; no need to hit the config per request.
     put_timeout = self._request_waiting_timeout

     sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM, 0)
     try:
         sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
         # set send buffer size
         sock.setsockopt(socket.SOL_SOCKET, socket.SO_SNDBUF,
                         self._send_buffer_size)
         # set receive buffer size
         sock.setsockopt(socket.SOL_SOCKET, socket.SO_RCVBUF,
                         self._recv_buffer_size)
         sock.bind((self._host, self._port))
         sock.listen(128)
         while True:
             conn, addr = sock.accept()
             print("Get connection accept")
             request = {
                 "conn": conn,
                 "addr": addr,
                 "delimiter": self._delimiter
             }
             try:
                 self._queue.put(request, block=True, timeout=put_timeout)
             except queue.Full:
                 # Nobody picked up a slot in time: drop this client instead
                 # of taking the whole server down.
                 conn.close()
                 print("Request queue is full, connection rejected")
                 continue
             print("Put connection to queue")
     finally:
         sock.close()
Пример #6
0
Файл: hub.py Проект: xlybaby/VAR
    def __init__(self, p_name, p_host, p_port, p_sendBufferSize,
                 p_recvBufferSize, p_queue, p_max_buffer_size=None,
                 p_read_chunk_size=None, p_delimiter="\n"):
        """Store the server settings and initialise the TCPServer base.

        p_name            -- server name.
        p_host, p_port    -- address to listen on.
        p_sendBufferSize  -- value kept for the socket send buffer.
        p_recvBufferSize  -- value kept for the socket receive buffer.
        p_queue           -- queue object kept for incoming requests.
        p_max_buffer_size, p_read_chunk_size -- forwarded to TCPServer.
        p_delimiter       -- message delimiter, newline by default.
        """
        TCPServer.__init__(self,
                           max_buffer_size=p_max_buffer_size,
                           read_chunk_size=p_read_chunk_size)

        self._name, self._host, self._port = p_name, p_host, p_port
        self._send_buffer_size, self._recv_buffer_size = (p_sendBufferSize,
                                                          p_recvBufferSize)
        self._delimiter, self._queue = p_delimiter, p_queue

        # Cached from the configuration at construction time.
        timeout_key = "headless.webdriver.requestWaittingTimeout"
        self._request_waiting_timeout = Configure.configure().value(
            p_key=timeout_key)
Пример #7
0
    def getPage(self, p_body):
        """Load the page named by ``p_body["addr"]`` and return its source.

        p_body -- request mapping; only its "addr" entry is used here.

        Returns the driver's page source on success. Returns the literal
        string "None" when no address is given or when loading the page
        times out. Other driver errors propagate to the caller.
        """
        try:
            addr = p_body.get("addr")
            if addr is None:
                return "None"
            Logger.getLogger().info("Get page source: %s", addr)

            # Usage bookkeeping for this driver.
            self._driverwrapper["lastactivetime"] = datetime.datetime.now()
            self._driverwrapper["usetimes"] += 1

            driver = self._driverwrapper["driver"]
            driver.set_page_load_timeout(Configure.configure().value(
                "headless.webdriver.driverGetPageTimeout"))
            driver.get(addr)
            Logger.getLogger().info("Get page source done")
            return driver.page_source
        except queue.Empty:
            # NOTE(review): nothing in the visible try body reads from a
            # queue any more; kept so the caught exception set is unchanged.
            Logger.getLogger().error("Driver pool is empty")
            return "None"
        except TimeoutException:
            Logger.getLogger().error("Driver get page timeout")
            return "None"
Пример #8
0
    async def callback(self, p_message=None):
        """Send one request to a free driver process and await its reply.

        Takes a driver process from ``self._driver_process_queue`` (waiting at
        most ``headless.webdriver.freeDriverWaittingTimeout``), puts
        ``p_message`` on its input queue without blocking and waits, with no
        time limit, for one item from its output queue.

        Returns that item, or the literal string "None" when the wait times
        out. A driver process that was taken is always put back on the queue.
        """
        proc = None
        print("Request coming: ", p_message)
        try:
            proc = await asyncio.wait_for(
                self._driver_process_queue.get(),
                Configure.configure().value(
                    "headless.webdriver.freeDriverWaittingTimeout"))
            Logger.getLogger().info("Got a web driver")
            proc.getInputQueue().put(p_message, block=False)
            print("put message: ", p_message)
            outq = proc.getOutputQueue()
            print("Waiting for response: ")
            # wait_for(..., timeout=None) is just a plain await.
            ret = await outq.get()
            print("Got response: ", ret)
            return ret

        except asyncio.TimeoutError:
            Logger.getLogger().error("Can't get free web driver")
            return "None"
        finally:
            # Hand the driver back whatever happened after it was taken.
            if proc is not None:
                self._driver_process_queue.put_nowait(proc)
Пример #9
0
 def __init__(self, p_queue, p_request_queue):
     """Keep the two queues and read the initial browser count.

     p_queue         -- stored as the driver process queue.
     p_request_queue -- stored as the request queue.
     """
     threading.Thread.__init__(self)
     self._driver_process_queue = p_queue
     self._request_queue = p_request_queue

     count_key = "headless.webdriver.iniBrowserNum"
     self._ini_driver_num = Configure.configure().value(count_key)
Пример #10
0
Файл: hub.py Проект: xlybaby/VAR
    def startlisten(p_name, p_prefix, p_queue, p_delimiter):
        """Build a StreamHandler from configuration and start it.

        p_name      -- server name, also used in the start-up message.
        p_prefix    -- configuration key prefix; ``.port``, ``.host``,
                       ``.sendBufferSize`` and ``.recvBufferSize`` are read
                       below it.
        p_queue     -- queue handed to the server.
        p_delimiter -- message delimiter handed to the server.
        """
        def setting(suffix):
            # One configuration lookup below the given prefix.
            return Configure.configure().value(p_key=p_prefix + suffix)

        port = setting(".port")
        host = setting(".host")
        send_size = setting(".sendBufferSize")
        recv_size = setting(".recvBufferSize")

        server = StreamHandler(
            p_name=p_name,
            p_host=host,
            p_port=port,
            p_sendBufferSize=send_size,
            p_recvBufferSize=recv_size,
            p_queue=p_queue,
            p_delimiter=p_delimiter,
        )
        print("Server[" + p_name + "] starts at " + str(port) + "...")
        server.start()
Пример #11
0
 def __init__(self):
   """Configure root logging with a timed rotating file handler.

   Target, level and rotation settings come from the ``logger.*``
   configuration keys. The root logger is published as ``Logger.logger`` and
   the 'apscheduler' logger is limited to ERROR.
   """
   # Level names accepted in the configuration.
   self._level = {name: getattr(logging, name)
                  for name in ("INFO", "WARNING", "DEBUG", "ERROR")}

   def setting(key):
     # Single lookup in the application configuration.
     return Configure.configure().value(p_key=key)

   logdir = setting("logger.dir")
   loglevel = setting("logger.level")
   logunit = setting("logger.keepUnit")
   loginterval = setting("logger.keepInterval")
   logcount = setting("logger.keepCount")

   log_file_handler = TimedRotatingFileHandler(
     filename=logdir,
     when=logunit,
     interval=loginterval,
     backupCount=logcount)
   log_file_handler.suffix = "%Y-%m-%d"
   log_file_handler.setFormatter(logging.Formatter(
     '%(asctime)s\tFile \"%(filename)s\"%(levelname)s: %(message)s'))

   # basicConfig runs before the file handler is attached, as before.
   logging.basicConfig(level=self._level[loglevel.upper()])
   root = logging.getLogger()
   Logger.logger = root
   root.addHandler(log_file_handler)

   logging.getLogger('apscheduler').setLevel(logging.ERROR)
Пример #12
0
 def __init__(self, p_schedule_server, p_leader):
     """Keep the schedule server and leader, then build the register task.

     p_schedule_server -- stored on the instance as-is.
     p_leader          -- stored on the instance as-is.

     The register task is created by ``generateRegisterTask`` with the
     interval configured under ``worknode.registerInterval``.
     """
     self._schedule_server, self._leader = p_schedule_server, p_leader
     self._register_instance = None

     build_task = self.generateRegisterTask
     register_interval = Configure.configure().value(
         p_key="worknode.registerInterval")
     self._register_task = build_task(p_interval=register_interval)
Пример #13
0
  def __init__(self, p_queue, p_request_queue):
    """Load the ``headless.webdriver.*`` settings and keep both queues.

    p_queue         -- stored as the driver queue.
    p_request_queue -- stored as the request queue.
    """
    threading.Thread.__init__(self)

    # attribute name -> configuration key, read in this order
    settings = (
        ("_interval", "headless.webdriver.browserIdleTimeMonitorInterval"),
        ("_max_idle_time", "headless.webdriver.maxBrowserIdleTime"),
        ("_alertMinAvailableNum", "headless.webdriver.alertMinAvailableNum"),
        ("_alertMaxAvailableNum", "headless.webdriver.alertMaxAvailableNum"),
        ("_monitorMinAvailableNum", "headless.webdriver.monitorMinAvailableNum"),
        ("_monitorMaxAvailableNum", "headless.webdriver.monitorMaxAvailableNum"),
        ("_iniBrowserNum", "headless.webdriver.iniBrowserNum"),
        ("_iniWinHeight", "headless.webdriver.iniBrowserWinHeight"),
        ("_iniWinWidth", "headless.webdriver.iniBrowserWinWidth"),
        ("_driver_path", "headless.webdriver.path"),
    )
    for attr_name, config_key in settings:
        setattr(self, attr_name, Configure.configure().value(config_key))

    self._alert_used_rate = 0.5
    self._driver_queue = p_queue
    self._request_queue = p_request_queue
Пример #14
0
    def __init__(self, p_addr, p_node_name, p_monitor):
        """Set up the node leader and start its worker processes.

        p_addr      -- stored as the host.
        p_node_name -- stored as the node id.
        p_monitor   -- object whose ``newProc`` is called for every worker
                       started here.

        ``worknode.maxWorkerNum`` Mate workers are created; they all share
        one manager queue sized by ``worknode.mainWorkQueueSize``.
        """
        self._node_id = p_node_name
        self._host = p_addr
        self._port = Configure.configure().value("server.nodeServer.port")

        worker_key = "worknode.maxWorkerNum"
        self._max_working_proc_num = Configure.configure().value(worker_key)

        shared = Manager()
        self._work_queue = shared.Queue(
            Configure.configure().value("worknode.mainWorkQueueSize"))
        self._monitor = p_monitor

        # Event name -> bound handler.
        self._events = {"work": self.accept}

        for _ in range(self._max_working_proc_num):
            worker = Mate(p_leader=self, p_queue=self._work_queue)
            worker.start()
            self._monitor.newProc(p_proc=worker)
Пример #15
0
 def listen(p_name, p_prefix, p_handler):
     """Start a SimpleTcpServer on the port configured under ``p_prefix``.

     p_name    -- server name, also used in the start-up message.
     p_prefix  -- configuration key prefix; ``<p_prefix>.port`` is read.
     p_handler -- passed to the server as its callback.
     """
     port_key = p_prefix + ".port"
     port = Configure.configure().value(p_key=port_key)

     tcp_server = SimpleTcpServer(p_name=p_name, p_callback=p_handler)
     tcp_server.listen(port)
     tcp_server.start()

     print("Server[" + p_name + "] starts at " + str(port) + "...")
Пример #16
0
  def fuckup(p_command=None):
    """Launch the web driver pool and start the request listener.

    Loads ``application-config.yml`` from the current working directory,
    creates the logger, builds the driver and request queues, runs the web
    driver container, logs how long that took and finally starts the stream
    server on the ``server.webdriverServer`` settings.

    p_command -- passed through unchanged to ``Configure.load``.
    """
    start = datetime.datetime.now()
    Main.rootdir = os.path.abspath('.')
    # Kept from the original; the handle itself is not used below.
    manager = Manager()

    # Initialize application configure
    filename = "application-config.yml"
    Configure.load(p_dir=Main.rootdir+"/"+filename, p_command=p_command)

    # Initialize log
    Logger()
    Logger.getLogger().info("Web Driver Pool Launching......")

    # Initialize driver pool
    driver_queue = queue.Queue(Configure.configure().value("headless.webdriver.maxBrowserNum"))
    request_queue = queue.Queue(Configure.configure().value("headless.webdriver.maxRequestAcceptNum"))

    Main.webDriverContainer = WebDriverContainer( p_queue = driver_queue, p_request_queue = request_queue )
    # run() is called directly, so this line returns only once it is done.
    Main.webDriverContainer.run()

    end = datetime.datetime.now()
    # Elapsed time is end - start. The operands used to be swapped, and
    # .seconds of a negative timedelta wraps around to just under 86400.
    duration = int((end - start).total_seconds())
    Logger.getLogger().info("Web Driver Pool Launched after %d seconds"%(duration))

    try:
        # The delimiter is configured as escaped hex codes ("\\x0d\\x0a"...).
        delimiter = Configure.configure().value("server.webdriverServer.delimiter")
        destr = ''
        for code in delimiter.split('\\x'):
            if code != '':
                # NOTE(review): each character is prepended, so the decoded
                # delimiter is in reverse order of the configured codes --
                # confirm that clients expect this before changing it.
                destr = chr(int(code, 16)) + destr
        StreamHandler.startlisten(p_name="Headless-Webdriver-Server", p_prefix="server.webdriverServer", p_queue=request_queue, p_delimiter=destr)
    except (KeyboardInterrupt, SystemExit):
        pass
Пример #17
0
 def check(self):
   """Add driver processes when waiting requests pile up.

   Compares the request queue size with the driver queue size. When the
   ratio reaches 0.5, or when requests are waiting and no driver is queued
   at all, up to ``self._iniBrowserNum`` new DriverProcess instances are
   created, queued and started. A failure while adding one process is
   reported and does not stop the remaining attempts.
   """
   drivernum = self._driver_queue.qsize()
   requestnum = self._request_queue.qsize()
   print ("current driver num is %d, waiting request num is %d"%(drivernum, requestnum))

   if drivernum == 0:
     # No queued driver: the ratio is undefined (this used to raise
     # ZeroDivisionError). Any waiting request counts as overload.
     overloaded = requestnum > 0
   else:
     overloaded = round(requestnum/drivernum, 4) >= 0.5
   if not overloaded:
     return

   print ("waiting request num is half of driver num")
   for i in range(self._iniBrowserNum):
     try:
       # Each added process gets a slightly longer alive timeout.
       timeout = int(Configure.configure().value("headless.webdriver.addedNewDriverProcessAliveTimeout")) + i
       proc = DriverProcess(p_request_queue=self._request_queue, p_alive_timeout=timeout)
       self._driver_queue.put(proc, block=False)
       proc.start()
     except Exception as err:
       # Still best effort, but no longer silent; KeyboardInterrupt and
       # SystemExit are no longer swallowed.
       print ("failed to add a new driver process: %s" % (err,))
Пример #18
0
    def fuckup(p_command=None):
        """Start a work node: configuration, logging, workers and node server.

        Loads ``application-config.yml`` from the current working directory,
        determines the local IP address, creates the logger, the
        Elasticsearch handler, the worker monitor with its scheduler, the
        leader and the communicator, starts the node server and then runs the
        tornado IO loop.

        p_command -- passed through unchanged to ``Configure.load``.
        """
        Main.rootdir = os.path.abspath('.')

        # Initialize application configure
        filename = "application-config.yml"
        Configure.load(p_dir=Main.rootdir + "/" + filename,
                       p_command=p_command)

        nodename = Configure.configure().value("worknode.name")

        # A connected UDP socket is used only to ask which local address it
        # got. The context manager closes it; the old try/finally hit an
        # unbound `s` when socket creation itself failed.
        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
            s.connect(('8.8.8.8', 80))
            Main.ipAddr = s.getsockname()[0]

        # Initialize log
        Logger()

        # Initialize elasticsearch client
        Main.es_client = ESHandler()

        # Initialize worker monitor
        monitor = MultiProcessJobWatcher()
        executors = {
            'default': ThreadPoolExecutor(1),
            'processpool': ProcessPoolExecutor(1)
        }
        job_defaults = {'coalesce': True, 'max_instances': 1}
        mosche = BackgroundScheduler(executors=executors,
                                     job_defaults=job_defaults,
                                     timezone=utc)
        # NOTE(review): the job is added but the scheduler is never started
        # in this function -- confirm whether mosche.start() is missing.
        mosche.add_job(monitor,
                       'interval',
                       seconds=Configure.configure().value(
                           "worknode.workerMonitorInterval"))

        # Initialize worker leader
        leader = Leader(p_addr=Main.ipAddr,
                        p_node_name=nodename,
                        p_monitor=monitor)

        # Initialize node register and health info report schedule
        scheduleserveraddr = Configure.configure().value(
            "server.healthServer.host")
        scheduleserverport = Configure.configure().value(
            "server.healthServer.port")
        scheduleserver = {
            "host": scheduleserveraddr,
            "port": scheduleserverport
        }
        Main.communicator = Communicator(p_schedule_server=scheduleserver,
                                         p_leader=leader)

        # Initialize node job accept service
        ServerWrapper.listen(p_name=nodename,
                             p_prefix="server.nodeServer",
                             p_handler=leader)
        # Blocks until the IO loop is stopped; the code below runs only then.
        tornado.ioloop.IOLoop.current().start()

        try:
            # Keeps the main thread alive once the IO loop has returned.
            while True:
                time.sleep(2)
        except (KeyboardInterrupt, SystemExit):
            # The handler used to call shutdown() on `parellelSchedule`, a
            # name that does not exist in this function (NameError). Shut
            # down the scheduler created above, and only if it is running.
            if mosche.running:
                mosche.shutdown()
Пример #19
0
 def __init__(self):
   """Create an empty process pool and read the configured worker limit."""
   self._proc_pool = {}
   # NOTE(review): the attribute is named "max working time" but is filled
   # from the "worknode.maxWorkerNum" key -- confirm which one is intended.
   limit_key = "worknode.maxWorkerNum"
   self._max_working_time_per_worker = Configure.configure().value(limit_key)