Пример #1
0
 def crawl(self, args, browser):
     """Crawl a single list page and queue the pages it leads to.

     Reads ``args['source']`` (name of the app-store module to import) and
     ``args['list_page_url']``.

     Returns False when the store module cannot be imported, or when the
     MD5 key of the URL is still present in ``self.url_rd``.  Otherwise the
     page is loaded in ``browser``, the detail URLs found on it are pushed,
     an optional next list page is pushed, the URL key is stored with an
     expiry of ``config.CRAWLED_URL_REDIS_TIMEOUT`` and True is returned.
     """
     store_name = args['source']
     page_url = args['list_page_url']

     try:
         store_module = __import__(store_name)
     except ImportError:
         util.write_log_error(self.lock, self.logger, "can not find " + store_name + ".py in configured appstores")
         return False
     store = store_module.Store()

     # The MD5 of the URL is the key under which crawled pages are remembered.
     url_key = util.get_MD5(page_url)
     if self.url_rd.exists(url_key):
         recently_crawled = "was crawled in past %d seconds" % config.CRAWLED_URL_REDIS_TIMEOUT
         util.write_log_warn(self.lock, self.logger, "list_page_url:" + page_url + recently_crawled)
         return False

     browser.desired_capabilities["phantomjs.page.settings.userAgent"] = store.PHANTOMJS_USER_AGENT
     util.browser_get_url(browser, page_url)
     # NOTE: scrolling via store.scroll_down(browser) is currently disabled.

     # Log how many "check more" clicks happened before the control was gone.
     clicks = store.click_checkmore(browser)
     util.write_log_warn(
         self.lock, self.logger,
         "列表页: " + page_url + "点击checkmore" + str(clicks) + "次后找不到checkmore")

     found_urls = store.get_detail_urls(browser)
     if not len(found_urls):
         # Message: the list page yielded zero apps.
         util.write_log_warn(self.lock, self.logger, "列表页: " + page_url + " 找到0个app")
     self.push_detail_page_message(args, found_urls)

     following_page = store.click_nextpage(browser)
     if following_page:
         self.push_list_page_message(args, following_page)

     util.write_log_info(self.lock, self.logger, "SUCCESS : list_page_url: %s crawled" % (page_url))
     # Remember the page as crawled, then let the marker expire.
     self.url_rd.set(url_key, 1)
     self.url_rd.expire(url_key, config.CRAWLED_URL_REDIS_TIMEOUT)
     return True
Пример #2
0
 def work_process(self, browser):
     """Worker loop: pop detail-page messages and hand them to ``do_work``.

     Never returns.  Each iteration calls ``self.rd.brpop`` on the high and
     low detail-page queues (presumably a Redis client; timeout argument 0),
     logs the second element of the result and passes it, together with
     ``browser``, to ``self.do_work``.  Any ``Exception`` raised while
     popping or handling is logged with its traceback and the loop goes on.
     """
     import traceback

     util.write_log_info(self.lock, self.logger, 'new detail_page_handler process started')
     while 1:
         try:
             message = self.rd.brpop([config.HIGH_DETAIL_PAGE_HANDLE_QUEUE, config.LOW_DETAIL_PAGE_HANDLE_QUEUE],0)
             util.write_log_info(self.lock, self.logger, 'received new nessage: %s'%(message[1]))
             self.do_work(message[1], browser)
         except Exception:
             # format_exc() returns the traceback as a string.  The previous
             # print_exc() returns None, so the concatenation raised TypeError
             # inside this handler and terminated the worker loop.
             util.write_log_error(self.lock, self.logger,"detail_page_handler process catched exception.\n" + traceback.format_exc())
Пример #3
0
 def work_process(self):
     """Worker loop: pop file-handling messages and hand them to ``do_work``.

     Never returns.  Each iteration calls ``self.rd.brpop`` on the high and
     low file-handle queues (presumably a Redis client; timeout argument 0),
     logs the second element of the result and passes it to
     ``self.do_work``.  Any ``Exception`` raised while popping or handling
     is logged with its traceback and the loop goes on.
     """
     import traceback

     util.write_log_info(self.lock, self.logger, 'new tmpfile_hander process started')
     while 1:
         try:
             message = self.rd.brpop([config.HIGH_FILE_HANDLE_QUEUE, config.LOW_FILE_HANDLE_QUEUE],0)
             util.write_log_info(self.lock, self.logger, 'received new nessage: %s'%(message[1]))
             self.do_work(message[1])
         except Exception:
             # format_exc() returns the traceback as a string.  The previous
             # print_exc() returns None, so the concatenation raised TypeError
             # inside this handler and terminated the worker loop.
             util.write_log_error(self.lock, self.logger,"tmpfile_hander process catched exception.\n" + traceback.format_exc())
Пример #4
0
    def work_thread(self):
        """Worker loop: pop download messages and hand them to ``do_work``.

        Never returns.  Each iteration calls ``self.rd.brpop`` on the high
        and low download queues (presumably a Redis client; timeout
        argument 0), logs the second element of the result and passes it to
        ``self.do_work``.  Any ``Exception`` raised while popping or
        handling is logged with its traceback and the loop goes on.
        """
        import traceback

        util.write_log_info(self.lock, self.logger, "new downloader thread started")
        while 1:
            try:
                message = self.rd.brpop([config.HIGH_DOWNLOAD_QUEUE, config.LOW_DOWNLOAD_QUEUE], 0)
                util.write_log_info(self.lock, self.logger, "received new nessage: %s" % (message[1]))
                self.do_work(message[1])
            except Exception:
                # format_exc() returns the traceback as a string.  The
                # previous print_exc() returns None, so the concatenation
                # raised TypeError inside this handler and killed the thread.
                util.write_log_error(
                    self.lock, self.logger, "downloader thread catched exception.\n" + traceback.format_exc()
                )
Пример #5
0
 def normal(self, args):
     """Download the file described by ``args`` and report the outcome.

     Builds a session from ``args["download_cookie_producer"]`` and
     ``args["request_headers"]``, then calls ``self.download``.

     Returns a ``(ok, redirect_urls)`` pair: ``ok`` is True only when
     ``self.download`` reported the string "success"; ``redirect_urls`` is
     passed through from ``self.download`` unchanged.
     """
     session = self.prepare_download_session(args["download_cookie_producer"], args["request_headers"])
     outcome, redirect_urls = self.download(session, args)

     # Anything other than the literal "success" is treated as the failure reason.
     if outcome != "success":
         failure_text = "FAIL : download_url: %s ; reason: %s" % (args["download_url"], outcome)
         util.write_log_error(self.lock, self.logger, failure_text)
         return False, redirect_urls

     success_text = "SUCCESS : download_url: %s , file_name: %s " % (args["download_url"], args["file_name"])
     util.write_log_info(self.lock, self.logger, success_text)
     return True, redirect_urls
Пример #6
0
def main(argv):
    """Start the tmpfile handler and keep it running until told to stop.

    Creates a logger when ``config.LOGGER_ENABLED`` is set, names the
    current process, launches a ``Tmpfile_hander`` and then reads lines from
    standard input until one equals ``exit`` or a KeyboardInterrupt occurs.
    Afterwards the handler's ``quit`` method is called.  ``argv`` is unused.
    """
    logger = util.initLog(config.LOGGER_NAME_FILE_HANDLER) if config.LOGGER_ENABLED else None
    multiprocessing.current_process().name = 'tmpfile_hander_process_main'

    handler = Tmpfile_hander(logger=logger, host=config.REDIS_HOST, port=config.REDIS_PORT)
    util.write_log_info(handler.lock, handler.logger, "[+]start tmpfile_hander...")
    handler.launch()

    try:
        # Block on stdin; every line other than "exit" is ignored.
        while raw_input() != 'exit':
            pass
    except KeyboardInterrupt:
        pass

    util.write_log_info(handler.lock, handler.logger, "user determined tmpfile_hander")
    handler.quit()
Пример #7
0
 def work_thread(self):
     """Worker loop: pop download messages and hand them to ``do_work``.

     Never returns.  Each iteration calls ``self.rd.brpop`` on the high and
     low download queues (presumably a Redis client; timeout argument 0),
     logs the second element of the result and passes it to
     ``self.do_work``.  Any ``Exception`` raised while popping or handling
     is logged with its traceback and the loop goes on.
     """
     import traceback

     util.write_log_info(self.lock, self.logger,
                         'new downloader thread started')
     while 1:
         try:
             message = self.rd.brpop(
                 [config.HIGH_DOWNLOAD_QUEUE, config.LOW_DOWNLOAD_QUEUE], 0)
             util.write_log_info(self.lock, self.logger,
                                 'received new nessage: %s' % (message[1]))
             self.do_work(message[1])
         except Exception:
             # format_exc() returns the traceback as a string.  The previous
             # print_exc() returns None, so the concatenation raised TypeError
             # inside this handler and killed the thread.
             util.write_log_error(
                 self.lock, self.logger,
                 'downloader thread catched exception.\n' +
                 traceback.format_exc())
Пример #8
0
 def work_process(self):
     """Worker loop: pop file-handling messages and hand them to ``do_work``.

     Never returns.  Each iteration calls ``self.rd.brpop`` on the high and
     low file-handle queues (presumably a Redis client; timeout argument 0),
     logs the second element of the result and passes it to
     ``self.do_work``.  Any ``Exception`` raised while popping or handling
     is logged with its traceback and the loop goes on.
     """
     import traceback

     util.write_log_info(self.lock, self.logger,
                         'new tmpfile_hander process started')
     while 1:
         try:
             message = self.rd.brpop([
                 config.HIGH_FILE_HANDLE_QUEUE, config.LOW_FILE_HANDLE_QUEUE
             ], 0)
             util.write_log_info(self.lock, self.logger,
                                 'received new nessage: %s' % (message[1]))
             self.do_work(message[1])
         except Exception:
             # format_exc() returns the traceback as a string.  The previous
             # print_exc() returns None, so the concatenation raised TypeError
             # inside this handler and terminated the worker loop.
             util.write_log_error(
                 self.lock, self.logger,
                 "tmpfile_hander process catched exception.\n" +
                 traceback.format_exc())
Пример #9
0
 def normal(self, args):
     """Run one download and translate its status into a boolean.

     A session is prepared from ``args['download_cookie_producer']`` and
     ``args['request_headers']`` and handed to ``self.download``.

     Returns ``(succeeded, download_redirect_urls)`` where ``succeeded`` is
     True exactly when ``self.download`` returned the status 'success'.
     """
     cookie_producer = args['download_cookie_producer']
     headers = args['request_headers']
     session = self.prepare_download_session(cookie_producer, headers)

     status, download_redirect_urls = self.download(session, args)
     succeeded = status == 'success'

     if succeeded:
         log_line = "SUCCESS : download_url: %s , file_name: %s " % (
             args['download_url'], args['file_name'])
         util.write_log_info(self.lock, self.logger, log_line)
     else:
         # Any other status value is reported as the failure reason.
         log_line = "FAIL : download_url: %s ; reason: %s" % (
             args['download_url'], status)
         util.write_log_error(self.lock, self.logger, log_line)

     return succeeded, download_redirect_urls
Пример #10
0
def main(argv):
    """Start the detail-page handler and keep it running until told to stop.

    Applies ``config.SOCKET_TIMEOUT`` as the default socket timeout, creates
    a logger when ``config.LOGGER_ENABLED`` is set, names the current
    process, launches a ``Detail_page_handler`` and then reads lines from
    standard input until one equals ``exit`` or a KeyboardInterrupt occurs.
    Afterwards the handler's ``quit`` method is called.  ``argv`` is unused.
    """
    import socket
    socket.setdefaulttimeout(config.SOCKET_TIMEOUT)

    logger = util.initLog(config.LOGGER_NAME_DETAIL_PAGE_HANDLER) if config.LOGGER_ENABLED else None
    multiprocessing.current_process().name = 'detail_page_handler_process_main'

    handler = Detail_page_handler(logger=logger, host=config.REDIS_HOST, port=config.REDIS_PORT)
    util.write_log_info(handler.lock, handler.logger, "[+]start detail_page_handler...")
    handler.launch()

    try:
        # iter(callable, sentinel) keeps reading lines until one equals 'exit'.
        for _ignored in iter(raw_input, 'exit'):
            pass
    except KeyboardInterrupt:
        pass

    util.write_log_info(handler.lock, handler.logger, "user determined detail_page_handler")
    handler.quit()
Пример #11
0
def main(argv):
    """Start the downloader and keep it running until told to stop.

    Applies ``config.SOCKET_TIMEOUT`` as the default socket timeout, creates
    a logger when ``config.LOGGER_ENABLED`` is set, names the current
    thread, launches a ``Downloader`` and then reads lines from standard
    input until one equals ``exit`` or a KeyboardInterrupt occurs.
    Afterwards the downloader's ``quit`` method is called.  ``argv`` is
    unused.
    """
    import socket

    socket.setdefaulttimeout(config.SOCKET_TIMEOUT)

    if config.LOGGER_ENABLED:
        logger = util.initLog(config.LOGGER_NAME_DOWNLOADER)
    else:
        logger = None

    threading.current_thread().setName("downloader_thread_main")
    service = Downloader(logger=logger, host=config.REDIS_HOST, port=config.REDIS_PORT)
    util.write_log_info(service.lock, service.logger, "[+]start downloader...")
    service.launch()

    try:
        # Wait for the operator to type "exit"; other input is ignored.
        while True:
            if raw_input() == "exit":
                break
    except KeyboardInterrupt:
        pass

    util.write_log_info(service.lock, service.logger, "user determined downloader")
    service.quit()
Пример #12
0
def main(argv):
    """Launch the tmpfile handler and wait for an ``exit`` command.

    A logger is created only when ``config.LOGGER_ENABLED`` is set.  After
    the handler is launched, lines are read from standard input until one
    equals ``exit`` or a KeyboardInterrupt occurs; then ``quit`` is called on
    the handler.  ``argv`` is unused.
    """
    if config.LOGGER_ENABLED:
        logger = util.initLog(config.LOGGER_NAME_FILE_HANDLER)
    else:
        logger = None

    multiprocessing.current_process().name = 'tmpfile_hander_process_main'

    worker = Tmpfile_hander(
        logger=logger, host=config.REDIS_HOST, port=config.REDIS_PORT)
    util.write_log_info(
        worker.lock, worker.logger, "[+]start tmpfile_hander...")
    worker.launch()

    try:
        # Keep reading commands; only 'exit' ends the wait.
        while True:
            typed = raw_input()
            if typed == 'exit':
                break
    except KeyboardInterrupt:
        pass

    util.write_log_info(
        worker.lock, worker.logger, "user determined tmpfile_hander")
    worker.quit()
Пример #13
0
 def crawl(self, args):
     """Verify a downloaded APK and file it under its MD5 in the target folder.

     Reads ``args['file_name']`` and ``args['target_folder']``.  When
     ``util.verify_apk`` rejects the file it is deleted and False is
     returned.  Otherwise the file is stored as
     ``<target_folder>cell-<first two MD5 chars>/<MD5>.apk`` (or deleted if
     that path already exists), the APK info and record are written, and
     True is returned.
     """
     source_path = args['file_name']
     target_folder = args['target_folder']
     apk_info = self.prepare_apk_info(args)
     util.makeDirs(self.lock, target_folder)

     verified = util.verify_apk(source_path)
     if not verified:
         util.write_log_warn(self.lock, self.logger, "FAILED :  %s jarsigner faild." % (source_path))
         os.remove(source_path)
         return False

     digest = util.get_file_MD5(source_path)
     apk_info['MD5'] = digest
     # Files are bucketed by the first two characters of their MD5.
     destination = "%scell-%s/%s.apk" % (target_folder, digest[:2], digest)

     if not os.path.exists(destination):
         # First time this APK is seen: flag it and move it into place.
         apk_info['new'] = 'True'
         os.rename(args['file_name'], destination)
     else:
         # Already stored under the same MD5: drop the duplicate download.
         os.remove(source_path)

     util.writeApkInfoFile(self.lock, target_folder, apk_info)
     util.writeRecord(self.lock, target_folder, apk_info)
     util.write_log_info(self.lock, self.logger, "SUCCESS : %s handle success" % (args['file_name']))
     return True
Пример #14
0
def main(argv):
    """Launch the downloader and wait for an ``exit`` command.

    Sets the default socket timeout from ``config.SOCKET_TIMEOUT`` and
    creates a logger only when ``config.LOGGER_ENABLED`` is set.  After the
    downloader is launched, lines are read from standard input until one
    equals ``exit`` or a KeyboardInterrupt occurs; then ``quit`` is called on
    the downloader.  ``argv`` is unused.
    """
    import socket
    socket.setdefaulttimeout(config.SOCKET_TIMEOUT)

    logger = (util.initLog(config.LOGGER_NAME_DOWNLOADER)
              if config.LOGGER_ENABLED else None)
    threading.current_thread().setName('downloader_thread_main')

    dl = Downloader(logger=logger,
                    host=config.REDIS_HOST,
                    port=config.REDIS_PORT)
    util.write_log_info(dl.lock, dl.logger, "[+]start downloader...")
    dl.launch()

    try:
        # Discard every input line until the operator enters 'exit'.
        while raw_input() != 'exit':
            continue
    except KeyboardInterrupt:
        pass

    util.write_log_info(dl.lock, dl.logger, "user determined downloader")
    dl.quit()
Пример #15
0
 def crawl(self, args):
     """Check a downloaded APK and move it to its MD5-based location.

     Uses ``args['file_name']`` as the downloaded file and
     ``args['target_folder']`` as the storage root.

     Returns False (after deleting the file) when ``util.verify_apk`` fails.
     Returns True once the file has been moved to
     ``<target_folder>cell-<MD5[:2]>/<MD5>.apk`` -- or deleted because that
     path already exists -- and the info file and record have been written.
     """
     downloaded = args['file_name']
     target_folder = args['target_folder']
     apk_info = self.prepare_apk_info(args)
     util.makeDirs(self.lock, target_folder)

     if not util.verify_apk(downloaded):
         warning = "FAILED :  %s jarsigner faild." % (downloaded)
         util.write_log_warn(self.lock, self.logger, warning)
         os.remove(downloaded)
         return False

     checksum = util.get_file_MD5(downloaded)
     apk_info['MD5'] = checksum
     # Storage path: a "cell-xx" bucket named after the MD5 prefix.
     stored_path = ''.join(
         [target_folder, 'cell-', checksum[:2], '/', checksum, '.apk'])

     already_stored = os.path.exists(stored_path)
     if already_stored:
         # Same content is on disk already; the new copy is redundant.
         os.remove(downloaded)
     else:
         apk_info['new'] = 'True'
         os.rename(args['file_name'], stored_path)

     util.writeApkInfoFile(self.lock, target_folder, apk_info)
     util.writeRecord(self.lock, target_folder, apk_info)
     done = "SUCCESS : %s handle success" % (args['file_name'])
     util.write_log_info(self.lock, self.logger, done)
     return True
Пример #16
0
        browser.desired_capabilities["phantomjs.page.settings.userAgent"] = appstore_class.PHANTOMJS_USER_AGENT
        util.browser_get_url(browser, detail_page_url)
        download_url = appstore_class.get_download_url(browser)
        if download_url is None:
            util.write_log_error(self.lock, self.logger, "详情页: " + detail_page_url + " 找不到download_url")
            return False
        out_json_message = appstore_class.make_detail2download_json_message(browser)
        out_json_message['detail_url'] = detail_page_url
        out_json_message['download_url'] = download_url
        if args['level'] = 'high':
            out_json_message['next_queue'] = config.HIGH_DOWNLOAD_QUEUE
        else:
            out_json_message['next_queue'] = config.LOW_DOWNLOAD_QUEUE
        out_json_message['level'] = args['level']
        out_json_message['message_type'] = args['message_type']
        util.write_log_info(self.lock, self.logger,"SUCCESS : detail_page_url: %s crawled"%(detail_page_url))
        self.url_rd.set(util.get_MD5(detail_page_url),1)
        self.url_rd.expire(util.get_MD5(detail_page_url), config.CRAWLED_URL_REDIS_TIMEOUT)
        return self.push_message(out_json_message)

    def normal(self, args, browser):
        """Handle a "normal" message: log a warning and do nothing else.

        ``args`` and ``browser`` are accepted but not used.  Always returns
        True.
        """
        notice = '"normal" message type did nothing.'
        util.write_log_warn(self.lock, self.logger, notice)
        return True

    def do_work(self, message, browser):
        """Dispatch one queued message to the method named by its type.

        Sleeps via ``util.random_sleep``, parses ``message`` with
        ``self.prepare_args`` and calls the method of this object whose name
        is ``args['message_type']`` with ``(args, browser)``.

        Returns None when ``prepare_args`` yields None, otherwise whatever
        the selected method returns.  Raises AttributeError when no method
        with that name exists.
        """
        util.random_sleep()
        args = self.prepare_args(message)
        if args is None:
            return None
        # SECURITY: message_type comes from the queued message.  The previous
        # eval('self.' + message_type) would execute any expression embedded
        # in it; getattr only performs an attribute lookup by name.
        # NOTE(review): any method of this object is still reachable this
        # way -- consider restricting to a known set of message types.
        handler = getattr(self, args['message_type'])
        return handler(args, browser)