Example #1
    def start(self):
        global __g_userSpace
        global __g_memDB

        __g_memDB = MemDB()
        __g_userSpace = UserSpace(__g_memDB)

        application = tornado.web.Application([
            (RequestMapping.register_node, RegisterHandler),
            (RequestMapping.fetch_dashboard, DashBoardHandler),
        ])
        application.listen(
            ApplicationProperties.configure(
                p_key='application.controller.register.server.port'))
        print('Application listening...[Port:%s]' %
              (ApplicationProperties.configure(
                  p_key='application.controller.register.server.port')))

        workers = gen.multi([worker(idx) for idx in range(1)])

        __g_userSpace.init()

        ioLoop = tornado.ioloop.IOLoop.current()
        #print(ioLoop)
        ioLoop.start()
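
The snippet above routes requests through a RequestMapping class that only supplies route patterns. A minimal hypothetical shape, with the attribute names taken from the examples and the URL patterns themselves assumed:

# Hypothetical RequestMapping: a plain namespace of Tornado route patterns.
class RequestMapping:
    register_node = r"/register"      # pattern assumed
    fetch_dashboard = r"/dashboard"   # pattern assumed
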
Example #2
    async def post(self):
        if "action" not in self.args or self.args["action"] != "fetchip":
            for key, values in global_ip_dic.items():
                print("%s - %s" % (key, values))
            self.write({"msg": "unknown action"})
            return

        while True:
            ips = list(global_ip_dic.keys())
            if len(ips) < 1:
                self.write({"msg": "no ip"})
                break
            #print("index %d arrived"%(int(self.args["index"])))
            # random.choice expresses the same uniform pick more directly.
            ip = random.choice(ips)
            maxtimes = int(
                ApplicationProperties.configure(
                    "application.shared.ippool.maxUseTimes"))
            expireOffset = int(
                ApplicationProperties.configure(
                    "application.shared.ippool.expireOffset"))
            now = datetime.now()
            expire = global_ip_dic[ip][1]
            times = global_ip_dic[ip][2]
            # total_seconds() also handles already-expired (negative) deltas,
            # which .seconds would misreport.
            if ((expire - now).total_seconds() / 60 < expireOffset) or (
                    times > maxtimes):
                print("ip[%s] is out of work - %s" % (ip, datetime.now()))
                print(global_ip_dic[ip])
                global_ip_dic.pop(ip)
            else:
                global_ip_dic[ip][2] = global_ip_dic[ip][2] + 1
                self.write({"ip": ip, "port": global_ip_dic[ip][0]})
                break
Example #3
 def __init__(self):
     self._workdir = ApplicationProperties.configure(
         'application.storage.filesystem.download.tempdir')
     self._contentdir = ApplicationProperties.configure(
         'application.storage.filesystem.download.tempdir') + "/html"
     self._staticdir = ApplicationProperties.configure(
         'application.storage.filesystem.download.staticdir')
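
Every example resolves settings through ApplicationProperties.populate and ApplicationProperties.configure, which are never defined in these snippets. A minimal sketch of a compatible helper, assuming the "config" entry points at a directory containing a flat key=value file named application.properties (the file name and format are assumptions):

import os


class ApplicationProperties:
    # Hypothetical stand-in for the configuration helper used throughout.
    _props = {}

    @classmethod
    def populate(cls, p_command):
        # Assumption: p_command["config"] is a directory holding a flat
        # key=value properties file.
        path = os.path.join(p_command["config"], "application.properties")
        with open(path, encoding="utf-8") as fp:
            for line in fp:
                line = line.strip()
                if line and not line.startswith("#") and "=" in line:
                    key, value = line.split("=", 1)
                    cls._props[key.strip()] = value.strip()

    @classmethod
    def configure(cls, p_key):
        # Called both positionally and as configure(p_key=...) in the
        # examples, so the parameter keeps that name.
        return cls._props[p_key]
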
Example #4
 def start(self):
     print('Application listening...[Port:%s]' %
           (ApplicationProperties.configure(
               p_key='application.bluewhale.server.port')))
     application = Application([
         (RequestMapping.whale_collect_start, WhaleHandler),
         (RequestMapping.whale_collect_write, WhaleWriter),
     ])
     application.listen(
         ApplicationProperties.configure(
             p_key='application.bluewhale.server.port'))
     IOLoop.current().start()
Example #5
async def refreship():
    maxtimes = int(
        ApplicationProperties.configure(
            "application.shared.ippool.maxUseTimes"))
    expireOffset = int(
        ApplicationProperties.configure(
            "application.shared.ippool.expireOffset"))
    now = datetime.now()
    # Rebuild the pool in place: deleting entries while iterating with
    # enumerate() skips elements, and a bare timedelta divided by 60 cannot
    # be compared to an int.
    global_ip_pool[:] = [
        rec for rec in global_ip_pool
        if (rec[2] - now).total_seconds() / 60 >= expireOffset
        and rec[3] <= maxtimes
    ]
Example #6
async def async_fetch_https(req):
    url = req["url"]
    mid = req["materialId"]
    print('[%s] async_fetch_https | url[%s]' %
          (datetime.now().strftime('%Y-%m-%d %H:%M:%S'), url))
    header = {
        "User-Agent":
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) VAR/1.0.0.1"
    }
    datadir = ApplicationProperties.configure(
        'application.storage.filesystem.download.tempdir')
    http_client = httpclient.AsyncHTTPClient()
    try:
        response = await http_client.fetch(
            url,
            method='GET',
            headers=header,
            connect_timeout=ApplicationProperties.configure(
                'application.exchange.server.download.connectTimeOut'),
            request_timeout=ApplicationProperties.configure(
                'application.exchange.server.download.requestTimeOut'),
            validate_cert=False)
    except Exception as e:
        print("async_fetch_https Error: %s" % e)
    else:
        selector = Selector(text=response.body)
        #print(response.body)
        metaChr = selector.xpath('//meta/@charset')
        charset = 'utf-8'
        if metaChr:
            charset = metaChr.get()
        else:
            # contains() takes the attribute first; comparing two string
            # literals would never match.
            metaChr = selector.xpath(
                "//meta[contains(@http-equiv, 'Content-Type')]/@content")
            if metaChr:
                charsetM = re.search(r'(?<=charset=)\w+',
                                     metaChr.get().lower())
                if charsetM:
                    charset = charsetM.group(0)
        print('Page encoding charset is %s' % (charset))

        seqno = str(mid) + datetime.now().strftime('%Y%m%d%H%M%S')
        req["writeTime"] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        req["seqno"] = seqno
        req["charset"] = charset
        # Context managers ensure both files are flushed and closed even if
        # a write fails (the originals were never closed).
        with open(datadir + "/" + seqno + ".ini", 'w+') as ini:
            ini.write(json.dumps(req) + "\n")

        with open(datadir + "/html/" + seqno, 'wb+') as fp:
            fp.write(response.body)
Example #7
async def sjob(pid, scannerId, startSeq, endSeq):
    conn, cur = PgConnectionPool.getConn()
    try:
        print("job fired...%s,%s,%d,%d" % (pid, scannerId, startSeq, endSeq))
        
        limit = int(
            ApplicationProperties.configure(
                "application.storage.access.page.limit"))
        offset = 0
        flag = limit
        while flag == limit:
            flag = 1
            cur.execute(
                'SELECT * FROM t_uc_material__%s where status = 2 and '
                'materialId >= %d and materialId < %d limit %d offset %d'
                % (pid, startSeq, endSeq, limit, offset))
            results = cur.fetchall()
            print(results)
            for idx, r in enumerate(results):
                #print(r[2])
                #definition = json.loads(r[2])
                #print("task definition:%s"%(definition))
                type = r[2]["type"]
                addr = r[2]["addr"]
                secondDomain = re.search(
                    r"[A-Za-z0-9\-]+\.com|[A-Za-z0-9\-]+\.edu.cn|"
                    r"[A-Za-z0-9\-]+\.cn|[A-Za-z0-9\-]+\.com.cn|"
                    r"[A-Za-z0-9\-]+\.org|[A-Za-z0-9\-]+\.net|"
                    r"[A-Za-z0-9\-]+\.tv|[A-Za-z0-9\-]+\.vip|"
                    r"[A-Za-z0-9\-]+\.cc|[A-Za-z0-9\-]+\.gov.cn|"
                    r"[A-Za-z0-9\-]+\.gov|[A-Za-z0-9\-]+\.edu|"
                    r"[A-Za-z0-9\-]+\.biz|[A-Za-z0-9\-]+\.net.cn|"
                    r"[A-Za-z0-9\-]+\.org.cn", addr.lower())
                #if type == 1 or type == 9:
                    #todo: send to massCollect
                    #{materialId, seqno, definition}
                    #None
                #else:
                await inq.put({"pid":pid,"materialId":r[0],"uuid":r[1],"type":type,"url":addr,"domain":secondDomain.group(),"definition":r[2]});
                flag=flag+idx
                
            offset = offset + limit
    except Exception as e:
        print(e)
    finally:
        PgConnectionPool.release(conn, cur)        
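
Examples #7, #14, #17, and #18 also lean on a PgConnectionPool with ini/getConn/release class methods that are never shown. A minimal sketch of a compatible wrapper; the psycopg2 dependency and pool size are assumptions:

from psycopg2 import pool


class PgConnectionPool:
    # Hypothetical pool matching the ini/getConn/release calls above.
    _pool = None

    @classmethod
    def ini(cls, host, port, user, password, database):
        cls._pool = pool.ThreadedConnectionPool(
            minconn=1, maxconn=5,
            host=host, port=port, user=user,
            password=password, dbname=database)

    @classmethod
    def getConn(cls):
        conn = cls._pool.getconn()
        return conn, conn.cursor()

    @classmethod
    def release(cls, conn, cur):
        cur.close()
        cls._pool.putconn(conn)
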
Example #8
 async def post(self):
     fp = open(ApplicationProperties.configure(
         "application.storage.filesystem.uc.rootdir") + "/data",
               'w+',
               encoding="utf-8")
     fp.write(json.dumps(WorkEnv.current_result, ensure_ascii=False))
     fp.flush()
     fp.close()
     print("Write done!")
Example #9
 def initialize_env():
     phases = WorkEnv.Environment.keys()
     #scheduler = TornadoScheduler()
     for idx, p in enumerate(phases):
         defi = WorkEnv.Environment[p]
         ini = {}
         ini["phase"] = idx + 1
         fp = open(ApplicationProperties.configure(
             "application.storage.filesystem.uc.rootdir") +
                   "/temp_data/phase" + str(ini["phase"]),
                   'w+',
                   encoding="utf-8")
         if "pagination" in defi:
             bridgeq = queues.Queue()
             ini["pagination"] = defi["pagination"]
             ini["selectors"] = defi["selectors"]
             interim = ini["pagination"]["interim"]
             print("gen pworkers")
             pworkers = gen.multi([
                 pagination(
                     ini["pagination"], defi["srcq"], bridgeq, "phase[" +
                     str(ini["phase"]) + "]-pagination[" + str(_) + "]")
                 for _ in range(p_concurrency)
             ])
             #print(pworkers)
             print(defi["srcq"])
             print(bridgeq)
             #scheduler.add_job(pagination,'interval', seconds=interim, args=(ini, defi["srcq"], bridgeq))
             cworkers = gen.multi([
                 worker(
                     Crawler(ini), bridgeq, defi["distq"], "phase[" +
                     str(ini["phase"]) + "]-crawler[" + str(_) + "]", fp)
                 for _ in range(c_concurrency)
             ])
             print("gen cworkers")
             print(cworkers)
             print(bridgeq)
             print(defi["distq"])
         else:
             ini["selectors"] = defi["selectors"]
             dq = None
             if "distq" in defi:
                 dq = defi["distq"]
                 print(defi["distq"])
             cworkers = gen.multi([
                 worker(
                     Crawler(ini), defi["srcq"], dq, "phase[" +
                     str(ini["phase"]) + "]-crawler[" + str(_) + "]", fp)
                 for _ in range(c_concurrency)
             ])
             print("gen cworkers")
             print(cworkers)
             print(defi["srcq"])
Example #10
    def start(self):
        self._listenPort = ApplicationProperties.configure(
            "application.exchange.server.port")
        self._mode = ApplicationProperties.configure(
            "application.exchange.server.needLimitRequest")

        if self._mode == "False":
            self._server = Server()
            print("Nolimited Server starting...")

        elif self._mode == "True":
            self._timelineSche = TornadoScheduler()
            # Register the job on the scheduler created above; _scheduler
            # is undefined in this scope.
            self._timelineSche.add_job(timeline, 'interval', seconds=5)
            self._timelineSche.start()
            self._server = LimitServer()
            print("Limited Server starting...")

        self._server.listen(self._listenPort)
        #if q:
        #    q.put_nowait(self)

        IOLoop.current().start()
Example #11
    def start(self):
        self._scheduler = TornadoScheduler()
        self._scheduler.add_job(
            fetchips,
            'interval',
            seconds=int(
                ApplicationProperties.configure(
                    "application.shared.ippool.fetchInterval")))
        self._scheduler.start()

        print('Application listening...[Port:%s]' %
              (ApplicationProperties.configure(
                  p_key='application.shared.ippool.server.port')))
        application = Application([
            (RequestMapping.dynamic_ip_assign, IPAssign),
        ])
        application.listen(
            ApplicationProperties.configure(
                p_key='application.shared.ippool.server.port'))

        #IOLoop.current().start()
        IOLoop.current().run_sync(fetchips)
        IOLoop.current().start()
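
Examples #10, #11, and #13 through #15 drive their periodic jobs with APScheduler's Tornado integration; the imports they presumably rely on are:

from apscheduler.schedulers.tornado import TornadoScheduler
from tornado.ioloop import IOLoop
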
Example #12
async def fetchips():
    flag = int(
        ApplicationProperties.configure(
            "application.shared.ippool.refreshFlagNum"))
    if len(global_ip_dic) < flag:
        print("job fired, current ip num %d, need min num is %d" %
              (len(global_ip_dic), flag))
        #for key,values in  global_ip_dic.items():
        #    print ("%s - %s"%(key,values))
        response = await fetch(
            'http://api.wandoudl.com/api/ip?app_key=8fbe2e6d52ee4657e89580276d698abc&pack=210570&num=10&xy=1&type=2&mr=1&'
        )
        print(response)
        if response:
            results = json.loads(response)
            if "data" in results:
                iplist = results["data"]
                for ip in iplist:
                    ts = datetime.strptime(ip["expire_time"],
                                           '%Y-%m-%d %H:%M:%S')
                    #global_ip_pool.append([ip["ip"],  ip["port"], ts, 0, ])
                    global_ip_dic[ip["ip"]] = [ip["port"], ts, 0]
                print(global_ip_dic)
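
The await fetch(...) call above relies on a helper that is not part of the snippet. A minimal sketch, assuming it wraps Tornado's AsyncHTTPClient and returns the response body as text (the decoding and error handling are assumptions):

from tornado import httpclient


async def fetch(url):
    # Hypothetical fetch() helper for the IP-pool API call above.
    client = httpclient.AsyncHTTPClient()
    try:
        response = await client.fetch(url)
    except Exception as e:
        print("fetch Error: %s" % e)
        return None
    return response.body.decode("utf-8")
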
Example #13
    def load(self):
        #jobstores = {
        #    'mongo': {'type': 'mongodb'},
        #   'default': SQLAlchemyJobStore(url='sqlite:///jobs.sqlite')
        #}
        executors = {'default': {'type': 'threadpool', 'max_workers': 5}}
        job_defaults = {'coalesce': False, 'max_instances': 3}
        self._massServerHost = ApplicationProperties.configure(
            "application.mass.server.host")
        self._massServerPort = ApplicationProperties.configure(
            "application.mass.server.port")

        print('Application listening...[Port:%s]' %
              (ApplicationProperties.configure(
                  p_key='application.collector.server.port')))
        application = tornado.web.Application([
            (RequestMapping.adjust_collect_job, JobHandler),
            (RequestMapping.fetch_collect_dashboard, DashBoardHandler),
        ])
        application.listen(
            ApplicationProperties.configure(
                p_key='application.collector.server.port'))

        self._scheduler = TornadoScheduler()
        self._scheduler.add_job(
            asyncjob,
            'interval',
            seconds=int(
                ApplicationProperties.configure(
                    "application.collector.schedule.intervalSeconds")),
            args=(ApplicationProperties.configure(
                'application.storage.filesystem.download.tempdir'),
                  ApplicationProperties.configure(
                      'application.storage.filesystem.download.tempdir') +
                  "/html", Collector()))
        #self._scheduler.configure(executors=executors, job_defaults=job_defaults)
        self._scheduler.start()

        tornado.ioloop.IOLoop.current().run_sync(self.send)
Example #14
    def start(self):
        global __g_scheduler
        __g_scheduler = TornadoScheduler()
        #ApplicationProperties.populate(p_command=self._command)
        
        scannerId = ApplicationProperties.configure("scannerId")
        print("Ready initialize scanner(id=%s)"%(scannerId))
        conn, cur = PgConnectionPool.getConn()
        try:
            sql = ('SELECT p.pid,p.maxSeq,p.minSeq,p.splitNum,s.intervalSecs '
                   'FROM t_uc_material_partition_scan s '
                   'inner join t_uc_material_partition p on s.pid=p.pid '
                   'where s.scannerId = %s')
            # (scannerId) alone is just a parenthesised string; the driver
            # expects a one-element tuple.
            data = (scannerId,)
            cur.execute(sql, data)
            results = cur.fetchall()
            print(results)
            record = results[0]
            
            splitNum = record[3]
            maxSeq = record[1]
            minSeq = record[2]
            intervalSecs = int(record[4])
            pid = record[0]
            fragment = math.ceil((maxSeq-minSeq+1)/splitNum)
            sp = minSeq
            for i in range(splitNum):
                fromI = sp
                sp = sp + fragment
                print("add scanner, from %d to %d, interval %d seconds, "
                      "partition id %s." % (fromI, sp, intervalSecs, pid))
                #memS.insert(startSeq=fromI,endSeq=sp,intervalSecs=__g_properties.getProperty("application.scanner.schedule.intervalSeconds"),scannerId=scannerId,pid=unassiP["pid"])
                __g_scheduler.add_job(sjob, 'interval', seconds=intervalSecs,
                                      args=(pid, scannerId, fromI, sp))
            __g_scheduler.start()
                    
        #except Exception as e:
        #    print(e)
        finally:
            PgConnectionPool.release(conn, cur)

        self._exchangeServerHost = ApplicationProperties.configure(
            "application.exchange.server.host")
        self._exchangeServerPort = ApplicationProperties.configure(
            "application.exchange.server.port")
        '''
        jobstores = {
            'mongo': {'type': 'mongodb'},
            'default': SQLAlchemyJobStore(url='sqlite:///jobs.sqlite')
        }
        executors = {
            'default': {'type': 'threadpool', 'max_workers': 5}
        }
        job_defaults = {
            'coalesce': False,
            'max_instances': 1
        }
        '''
        '''
        for idx, sche in enumerate(p_schedule):
            sid = sche["scannerId"]+"#"+str(idx)
            _scheduler.add_job(sjob,'interval', seconds=_properties.getProperty("application.scanner.schedule.intervalSeconds"), args=(sche["pid"],sche["tableName"],sid,sche["startSeq"],sche["endSeq"]))
        '''
        
        
        #if q:
        #    q.put_nowait(self)
        #newLoop = tornado.ioloop.IOLoop.current()
        #print(newLoop)
        tornado.ioloop.IOLoop.current().run_sync(self.send)
Example #15
        self._mode = None
        self._timelineSche = None

    def start(self):
        self._listenPort = ApplicationProperties.configure(
            "application.exchange.server.port")
        self._mode = ApplicationProperties.configure(
            "application.exchange.server.needLimitRequest")

        if self._mode == "False":
            self._server = Server()
            print("Nolimited Server starting...")

        elif self._mode == "True":
            self._timelineSche = TornadoScheduler()
            # Register the job on the scheduler created above; _scheduler
            # is undefined in this scope.
            self._timelineSche.add_job(timeline, 'interval', seconds=5)
            self._timelineSche.start()
            self._server = LimitServer()
            print("Limited Server starting...")

        self._server.listen(self._listenPort)
        #if q:
        #    q.put_nowait(self)

        IOLoop.current().start()


if __name__ == '__main__':
    ApplicationProperties.populate(
        {"config": "/Users/apple/Documents/java/workspace/var-daemon"})
    Exchange().start()
Example #16
    def run(self):
        urllib3.contrib.pyopenssl.inject_into_urllib3()
        start = time.time()
        fetching, fetched, dead = set(), set(), set()

        server = Server()
        server.listen(
            int(ApplicationProperties.configure(
                "application.mass.server.port")))
        print(
            "Server listening: %s" %
            (ApplicationProperties.configure("application.mass.server.port")))
        '''
        properties = ApplicationProperties()
        application = web.Application([ (RequestMapping.submit_fetch_url_task, MainHandler),
                                                                                       (RequestMapping.get_current_fetch_worker_queue_num, dashboard), ])
        application.listen(properties.getProperty(p_key='server.port'))
        '''
        async def fetch_url(task):
            definition = task["definition"]
            current_url = definition["addr"]
            if current_url in fetching:
                return

            print("fetching %s" % current_url)
            fetching.add(current_url)
            try:
                contents, urls = await get_links_from_url(definition)
            except Exception as e:
                print("Error: %s" % e)
            else:
                print("after fetching %s" % current_url)
                fetching.remove(current_url)
                print(contents, urls)
                if definition["level"] == 1:
                    if len(urls) > 0:
                        dataq[task["seqno"]]["queue"] = queues.Queue(len(urls))
                    dataq[task["seqno"]]["contents"] = contents
                    dataq[task["seqno"]]["childrennum"] = len(urls)
                    print("level 1 had done")
                    print(dataq)
                else:
                    if "queue" in dataq[task["seqno"]]:
                        print("child content had done")
                        print(contents)
                        q = dataq[task["seqno"]]["queue"]
                        q.put(contents)

                        if q.qsize() == dataq[task["seqno"]]["childrennum"]:
                            write(task, dataq[task["seqno"]])

                if "child" in definition:
                    for url in urls:
                        #newurls = [urljoin(url, remove_fragment(new_url)) for new_url in get_links(task["extractLinkSelector"], s)]
                        #for nu in newurls:
                        #subtask = copy.deepcopy(task)
                        subtask = {}
                        subtask["pid"] = task["pid"]
                        subtask["uuid"] = task["uuid"]
                        subtask["type"] = task["type"]
                        subtask["writeTime"] = task["writeTime"]
                        subtask["charset"] = task["charset"]
                        subtask["materialId"] = task["materialId"]
                        subtask["seqno"] = task["seqno"]
                        subtask["definition"] = {}
                        subtask["definition"]["addr"] = url  #["addr"]
                        #subtask["definition"]["parent"] = url["parent"]
                        subtask["definition"]["selectors"] = definition[
                            "child"]["selectors"]
                        subtask["definition"]["level"] = definition["child"][
                            "level"]
                        subtask["definition"]["wrap"] = definition["child"][
                            "wrap"]
                        if "child" in definition["child"]:
                            subtask["definition"]["child"] = definition[
                                "child"]["child"]
                        await inboundq.put(subtask)

                fetched.add(current_url)

            #for new_url in urls:
            # Only follow links beneath the base URL
            #if new_url.startswith(base_url):
            #    await q.put(new_url)
            #    await inboundq .put(new_url)

        async def worker():
            async for task in inboundq:
                if task is None:
                    return
                try:
                    if task["type"] == 1:
                        definition = task["definition"]
                        if definition["level"] == 1:
                            seqno = task["seqno"]
                            dataq[seqno] = {"queue": None}
                    await fetch_url(task)

                except Exception as e:
                    # `url` is undefined in this scope; report the failing
                    # task's address instead.
                    addr = task.get("definition", {}).get("addr")
                    print("Exception: %s %s" % (e, addr))
                    dead.add(addr)
                finally:
                    inboundq.task_done()

        #await q.put(base_url)

        # Start workers, then wait for the work queue to be empty.
        workers = gen.multi([worker() for _ in range(concurrency)])
        ioloop.IOLoop.current().start()
Example #17
                except Exception as e:
                    # As in Example #16, `url` is undefined in this scope;
                    # report the task's address instead.
                    addr = task.get("definition", {}).get("addr")
                    print("Exception: %s %s" % (e, addr))
                    dead.add(addr)
                finally:
                    inboundq.task_done()

        #await q.put(base_url)

        # Start workers, then wait for the work queue to be empty.
        workers = gen.multi([worker() for _ in range(concurrency)])
        ioloop.IOLoop.current().start()
        #io_loop = ioloop.IOLoop.current()
        #io_loop.run_sync(main)


if __name__ == '__main__':
    ApplicationProperties.populate(
        {"config": "/Users/apple/Documents/var/workspace/var-daemon"})
    PgConnectionPool.ini(
        host=ApplicationProperties.configure(
            "application.storage.postgres.connection.host"),
        port=ApplicationProperties.configure(
            "application.storage.postgres.connection.port"),
        user=ApplicationProperties.configure(
            "application.storage.postgres.connection.user"),
        password=ApplicationProperties.configure(
            "application.storage.postgres.connection.password"),
        database=ApplicationProperties.configure(
            "application.storage.postgres.connection.database"))

    LoadCollectors().run()
Example #18
if __name__ == '__main__':
    props = {}
    if len(sys.argv) > 1:
        for idx in range(1, len(sys.argv)):
            arg = sys.argv[idx]
            if arg.startswith("-"):
                prop = arg[1:]
                pair = prop.split("=")
                props[pair[0]] = pair[1]

    print(props)
    #Main( p_command=props )
    module = None
    if "module" in props:
        ApplicationProperties.populate(props)
        PgConnectionPool.ini(
            host=ApplicationProperties.configure(
                "application.storage.postgres.connection.host"),
            port=ApplicationProperties.configure(
                "application.storage.postgres.connection.port"),
            user=ApplicationProperties.configure(
                "application.storage.postgres.connection.user"),
            password=ApplicationProperties.configure(
                "application.storage.postgres.connection.password"),
            database=ApplicationProperties.configure(
                "application.storage.postgres.connection.database"))

        module = props["module"]
        print("Run application's module [%s]" % (module))
        if module == "centricControl":