示例#1
0
 def prepare_args(self, args_str):
     """Decode a download message and fill in defaults for omitted options.

     Args:
         args_str: JSON text of the message. It must decode to an object
             containing at least "download_url".

     Returns:
         The decoded dict with every optional key present, or None (after
         an error has been logged) when the text is not valid JSON, does
         not decode to a JSON object, or lacks "download_url".
     """
     try:
         args = json.loads(args_str)
     except (TypeError, ValueError):
         args = None
     # Valid JSON that is not an object (number, string, null, list) cannot
     # carry named options. Reject it here instead of failing below with a
     # TypeError on the membership test or the item assignment.
     if not isinstance(args, dict):
         util.write_log_error(self.lock, self.logger, "can not parse %s to json args." % (args_str))
         return None
     if "download_url" not in args:
         util.write_log_error(self.lock, self.logger, "download_url not found.")
         return None

     # Each default is wrapped in a callable so that config attributes and
     # self.DEFAULT_DOWNLOAD_FILE_NAME() are only evaluated when the message
     # actually omits the key.
     lazy_defaults = (
         ("level", lambda: "low"),
         ("method", lambda: config.REQUESTS_ACTION_GET),
         ("data", lambda: None),
         ("request_headers", lambda: config.REQUEST_HEADERS),
         ("download_cookie_producer", lambda: None),
         ("download_folder", lambda: config.DEFAULT_DOWNLOAD_FOLDER),
         ("download_file_name", lambda: self.DEFAULT_DOWNLOAD_FILE_NAME()),
         ("download_file_type", lambda: config.DEFAULT_DOWNLOAD_FILE_TYPE),
         ("target_folder", lambda: config.DEFAULT_FILE_HANDLE_FOLDER),
         ("message_type", lambda: config.MESSAGE_TYPE_NORMAL),
     )
     for key, make_default in lazy_defaults:
         if key not in args:
             args[key] = make_default()
     return args
 def crawl(self, args, browser):
     """Crawl one list page and queue the detail/next-list pages it yields.

     Args:
         args: message dict; 'source' names the store module to import and
             'list_page_url' is the page to open.
         browser: browser object whose user agent capability is overwritten
             with the store's PHANTOMJS_USER_AGENT before the page is loaded.

     Returns:
         False when the store module cannot be imported or the URL is still
         marked as crawled in self.url_rd; True once the page was processed
         and the URL has been marked.
     """
     source = args['source']
     list_page_url = args['list_page_url']

     # NOTE(review): the module name comes straight from the message; confirm
     # that only trusted producers can write to the queue.
     try:
         store_module = __import__(source)
     except ImportError:
         not_found_msg = "can not find " + source + ".py in configured appstores"
         util.write_log_error(self.lock, self.logger, not_found_msg)
         return False
     store = store_module.Store()

     # The MD5 of the URL is the key used to remember recently crawled pages.
     url_key = util.get_MD5(list_page_url)
     if self.url_rd.exists(url_key):
         skipped_msg = "list_page_url:" + list_page_url + "was crawled in past %d seconds"%(config.CRAWLED_URL_REDIS_TIMEOUT)
         util.write_log_warn(self.lock, self.logger, skipped_msg)
         return False

     browser.desired_capabilities["phantomjs.page.settings.userAgent"] = store.PHANTOMJS_USER_AGENT
     util.browser_get_url(browser, list_page_url)

     # Logged unconditionally after click_checkmore returns its click count.
     clicks = store.click_checkmore(browser)
     util.write_log_warn(self.lock, self.logger, "列表页: " + list_page_url + "点击checkmore" + str(clicks) + "次后找不到checkmore")

     detail_urls = store.get_detail_urls(browser)
     if len(detail_urls) == 0:
         util.write_log_warn(self.lock, self.logger, "列表页: " + list_page_url + " 找到0个app")
     self.push_detail_page_message(args, detail_urls)

     following_page = store.click_nextpage(browser)
     if following_page:
         self.push_list_page_message(args, following_page)

     util.write_log_info(self.lock, self.logger,"SUCCESS : list_page_url: %s crawled"%(list_page_url))
     # Mark the URL as crawled and let the mark expire after the timeout.
     self.url_rd.set(url_key, 1)
     self.url_rd.expire(url_key, config.CRAWLED_URL_REDIS_TIMEOUT)
     return True
示例#3
0
 def prepare_args(self, args_str):
     """Decode a download message and fill in defaults for omitted options.

     Args:
         args_str: JSON text of the message. It must decode to an object
             containing at least 'download_url'.

     Returns:
         The decoded dict with every optional key present, or None (after
         an error has been logged) when the text is not valid JSON, does
         not decode to a JSON object, or lacks 'download_url'.
     """
     try:
         args = json.loads(args_str)
     except (TypeError, ValueError):
         args = None
     # json.loads also accepts numbers, strings, lists and null; those would
     # make the key checks below raise TypeError, so treat them like
     # unparsable input.
     if not isinstance(args, dict):
         util.write_log_error(self.lock, self.logger,
                              'can not parse %s to json args.' % (args_str))
         return None
     if 'download_url' not in args:
         util.write_log_error(self.lock, self.logger,
                              'download_url not found.')
         return None

     # Optional keys: only fill in what the message left out.
     if 'level' not in args:
         args['level'] = 'low'
     if 'method' not in args:
         args['method'] = config.REQUESTS_ACTION_GET
     if 'data' not in args:
         args['data'] = None
     if 'request_headers' not in args:
         args['request_headers'] = config.REQUEST_HEADERS
     if 'download_cookie_producer' not in args:
         args['download_cookie_producer'] = None
     if 'download_folder' not in args:
         args['download_folder'] = config.DEFAULT_DOWNLOAD_FOLDER
     if 'download_file_name' not in args:
         args['download_file_name'] = self.DEFAULT_DOWNLOAD_FILE_NAME()
     if 'download_file_type' not in args:
         args['download_file_type'] = config.DEFAULT_DOWNLOAD_FILE_TYPE
     if 'target_folder' not in args:
         args['target_folder'] = config.DEFAULT_FILE_HANDLE_FOLDER
     if 'message_type' not in args:
         args['message_type'] = config.MESSAGE_TYPE_NORMAL
     return args
示例#4
0
 def work_process(self, browser):
     """Consume detail-page messages forever and hand each one to do_work.

     Pops from the detail page queues (the high-priority key is listed
     first) and passes element 1 of the popped item, together with
     ``browser``, to ``self.do_work``. Any exception raised while handling a
     message is logged with its traceback and the loop carries on.

     Args:
         browser: passed through unchanged to ``self.do_work``.
     """
     import traceback

     util.write_log_info(self.lock, self.logger, 'new detail_page_handler process started')
     while True:
         try:
             # presumably self.rd is a redis client, so brpop with timeout 0
             # blocks until an item arrives - confirm against the constructor.
             message = self.rd.brpop([config.HIGH_DETAIL_PAGE_HANDLE_QUEUE, config.LOW_DETAIL_PAGE_HANDLE_QUEUE],0)
             util.write_log_info(self.lock, self.logger, 'received new nessage: %s'%(message[1]))
             self.do_work(message[1], browser)
         except Exception:
             # format_exc() returns the traceback as text. print_exc() returns
             # None, so concatenating its result raised TypeError inside this
             # handler and terminated the worker loop.
             util.write_log_error(self.lock, self.logger,"detail_page_handler process catched exception." + traceback.format_exc())
示例#5
0
 def work_process(self):
     """Consume file-handle messages forever and hand each one to do_work.

     Pops from the file handle queues (the high-priority key is listed
     first) and passes element 1 of the popped item to ``self.do_work``.
     Any exception raised while handling a message is logged with its
     traceback and the loop carries on.
     """
     util.write_log_info(self.lock, self.logger, 'new tmpfile_hander process started')
     while True:
         try:
             # presumably self.rd is a redis client, so brpop with timeout 0
             # blocks until an item arrives - confirm against the constructor.
             message = self.rd.brpop([config.HIGH_FILE_HANDLE_QUEUE, config.LOW_FILE_HANDLE_QUEUE],0)
             util.write_log_info(self.lock, self.logger, 'received new nessage: %s'%(message[1]))
             self.do_work(message[1])
         except Exception:
             import traceback
             # traceback.print_exc() prints and returns None; adding None to
             # the message raised TypeError from inside the handler and killed
             # the loop. format_exc() returns the text to append.
             util.write_log_error(self.lock, self.logger,"tmpfile_hander process catched exception." + traceback.format_exc())
示例#6
0
 def launch(self):
     """Start worker processes until the configured browser count is reached.

     Each successfully started browser is recorded in
     ``self.phantomjs_webdrivers`` and handed to a new daemon process running
     ``self.work_process``; the process is recorded in ``self.processes``.
     A failed browser start is logged and the loop tries again.
     """
     while len(self.phantomjs_webdrivers) < config.MAX_DETAIL_PAGE_HANDLER_PROCESS_NUM:
         browser = util.start_phantomJS_browser()
         if not browser:
             util.write_log_error(self.lock, self.logger,"phantomJS start failed.")
             continue

         self.phantomjs_webdrivers.append(browser)
         worker = multiprocessing.Process(target=self.work_process, args=(browser,))
         worker.daemon = True
         self.processes.append(worker)
         worker.start()
示例#7
0
    def work_thread(self):
        """Consume download messages forever and hand each one to do_work.

        Pops from the download queues (the high-priority key is listed
        first) and passes element 1 of the popped item to ``self.do_work``.
        Any exception raised while handling a message is logged with its
        traceback and the loop carries on.
        """
        import traceback

        util.write_log_info(self.lock, self.logger, "new downloader thread started")
        while True:
            try:
                # presumably self.rd is a redis client, so brpop with timeout
                # 0 blocks until an item arrives - confirm against the
                # constructor.
                message = self.rd.brpop([config.HIGH_DOWNLOAD_QUEUE, config.LOW_DOWNLOAD_QUEUE], 0)
                util.write_log_info(self.lock, self.logger, "received new nessage: %s" % (message[1]))
                self.do_work(message[1])
            except Exception:
                # print_exc() returns None, which made the concatenation raise
                # TypeError inside the handler and stop the thread;
                # format_exc() returns the traceback text instead.
                util.write_log_error(
                    self.lock, self.logger, "downloader thread catched exception." + traceback.format_exc()
                )
示例#8
0
 def normal(self, args):
     """Run one download and report its outcome as a boolean.

     Args:
         args: message dict; reads "download_cookie_producer",
             "request_headers" and "download_url", plus "file_name" on
             success.

     Returns:
         A ``(ok, redirect_urls)`` tuple. ``ok`` is True when
         ``self.download`` reported "success" and False otherwise;
         ``redirect_urls`` is passed through from ``self.download``.
     """
     http_session = self.prepare_download_session(args["download_cookie_producer"], args["request_headers"])
     status, redirect_urls = self.download(http_session, args)

     # Anything other than the literal "success" is treated as the failure
     # reason and logged as such.
     if status != "success":
         failure_msg = "FAIL : download_url: %s ; reason: %s" % (args["download_url"], status)
         util.write_log_error(self.lock, self.logger, failure_msg)
         return False, redirect_urls

     success_msg = "SUCCESS : download_url: %s , file_name: %s " % (args["download_url"], args["file_name"])
     util.write_log_info(self.lock, self.logger, success_msg)
     return True, redirect_urls
示例#9
0
 def work_thread(self):
     """Consume download messages forever and hand each one to do_work.

     Pops from the download queues (the high-priority key is listed first)
     and passes element 1 of the popped item to ``self.do_work``. Any
     exception raised while handling a message is logged with its traceback
     and the loop carries on.
     """
     util.write_log_info(self.lock, self.logger,
                         'new downloader thread started')
     while True:
         try:
             # presumably self.rd is a redis client, so brpop with timeout 0
             # blocks until an item arrives - confirm against the constructor.
             message = self.rd.brpop(
                 [config.HIGH_DOWNLOAD_QUEUE, config.LOW_DOWNLOAD_QUEUE], 0)
             util.write_log_info(self.lock, self.logger,
                                 'received new nessage: %s' % (message[1]))
             self.do_work(message[1])
         except Exception:
             import traceback
             # format_exc() yields the traceback as a string. The previous
             # print_exc() call returned None, so the "+" raised TypeError
             # here and ended the thread.
             util.write_log_error(
                 self.lock, self.logger,
                 'downloader thread catched exception.' +
                 traceback.format_exc())
示例#10
0
 def work_process(self):
     """Consume file-handle messages forever and hand each one to do_work.

     Pops from the file handle queues (the high-priority key is listed
     first) and passes element 1 of the popped item to ``self.do_work``.
     Any exception raised while handling a message is logged with its
     traceback and the loop carries on.
     """
     import traceback

     util.write_log_info(self.lock, self.logger,
                         'new tmpfile_hander process started')
     while True:
         try:
             # presumably self.rd is a redis client, so brpop with timeout 0
             # blocks until an item arrives - confirm against the constructor.
             message = self.rd.brpop([
                 config.HIGH_FILE_HANDLE_QUEUE, config.LOW_FILE_HANDLE_QUEUE
             ], 0)
             util.write_log_info(self.lock, self.logger,
                                 'received new nessage: %s' % (message[1]))
             self.do_work(message[1])
         except Exception:
             # print_exc() writes to stderr and returns None; concatenating
             # that raised TypeError and stopped the loop. format_exc()
             # returns the traceback text so it ends up in the log.
             util.write_log_error(
                 self.lock, self.logger,
                 "tmpfile_hander process catched exception." +
                 traceback.format_exc())
示例#11
0
 def normal(self, args):
     """Perform a single download and translate its status into a boolean.

     Args:
         args: message dict; reads 'download_cookie_producer',
             'request_headers' and 'download_url', plus 'file_name' on
             success.

     Returns:
         A ``(ok, redirect_urls)`` tuple. ``ok`` is True only when
         ``self.download`` returned the status 'success'; the second item
         is whatever ``self.download`` returned alongside the status.
     """
     cookie_producer = args['download_cookie_producer']
     headers = args['request_headers']
     session = self.prepare_download_session(cookie_producer, headers)

     outcome, download_redirect_urls = self.download(session, args)
     succeeded = outcome == 'success'
     if succeeded:
         util.write_log_info(
             self.lock, self.logger,
             "SUCCESS : download_url: %s , file_name: %s " %
             (args['download_url'], args['file_name']))
     else:
         # A non-success status doubles as the failure reason in the log.
         util.write_log_error(
             self.lock, self.logger,
             "FAIL : download_url: %s ; reason: %s" %
             (args['download_url'], outcome))
     return succeeded, download_redirect_urls
示例#12
0
 def prepare_args(self, args_str):
     """Decode a file-handle message, fill defaults and derive download_time.

     Args:
         args_str: JSON text of the message. It must decode to an object
             containing 'file_name', whose base name (without extension)
             carries a numeric timestamp as its second '_'-separated field.

     Returns:
         The decoded dict with 'level', 'target_folder', 'message_type' and
         'download_time' (local time, '%Y-%m-%d/%H:%M:%S') populated, or
         None (after an error has been logged) when the text is not a JSON
         object, lacks 'file_name', or the timestamp cannot be read from
         the file name.
     """
     try:
         args = json.loads(args_str)
     except (TypeError, ValueError):
         args = None
     # Non-object JSON (number, string, null, list) would raise TypeError on
     # the key checks below; reject it the same way as unparsable text.
     if not isinstance(args, dict):
         util.write_log_error(self.lock, self.logger, 'can not parse %s to json args.'%(args_str))
         return None
     if 'file_name' not in args:
         util.write_log_error(self.lock, self.logger, 'file_name not found.')
         return None
     if 'level' not in args:
         args['level'] = 'low'
     if 'target_folder' not in args:
         args['target_folder'] = config.DEFAULT_FILE_HANDLE_FOLDER
     if 'message_type' not in args:
         args['message_type'] = config.MESSAGE_TYPE_NORMAL
     # A file name without a numeric "_<timestamp>" field used to escape as
     # IndexError/ValueError; report it like every other invalid message.
     try:
         stem = os.path.splitext(os.path.basename(args['file_name']))[0]
         download_time = float(stem.split('_')[1])
     except (AttributeError, TypeError, IndexError, ValueError):
         util.write_log_error(self.lock, self.logger, 'can not get download time from file_name %s.'%(args['file_name']))
         return None
     args['download_time'] = time.strftime('%Y-%m-%d/%H:%M:%S',time.localtime(download_time))
     return args
示例#13
0
 def crawl(self, args, browser):
     """Crawl one detail page and build the download message for it.

     Args:
         args: message dict; reads 'source' (store module name),
             'detail_page_url' and 'level'.
         browser: browser object whose user agent capability is overwritten
             with the store's PHANTOMJS_USER_AGENT before the page is loaded.

     Returns:
         False when the store module cannot be imported, the URL is still
         marked as crawled in self.url_rd, or no download URL is found.

     NOTE(review): the visible source stops after the 'high' branch; the
     handling of other levels, the queue push and the success return value
     are not shown here and have not been reconstructed.
     """
     source = args['source']
     detail_page_url = args['detail_page_url']
     # NOTE(review): the module name comes straight from the message; confirm
     # that only trusted producers can write to the queue.
     try:
         appstore_modual = __import__(source)
     except ImportError:
         util.write_log_error(self.lock, self.logger, "can not find " + source + ".py in configured appstores")
         return False
     appstore_class = appstore_modual.Store()
     # The MD5 of the URL is the key used to remember recently crawled pages.
     if self.url_rd.exists(util.get_MD5(detail_page_url)):
         util.write_log_warn(self.lock, self.logger, "detail_page_url:" + detail_page_url + "was crawled in past %d seconds"%(config.CRAWLED_URL_REDIS_TIMEOUT))
         return False
     browser.desired_capabilities["phantomjs.page.settings.userAgent"] = appstore_class.PHANTOMJS_USER_AGENT
     util.browser_get_url(browser, detail_page_url)
     download_url = appstore_class.get_download_url(browser)
     if download_url is None:
         util.write_log_error(self.lock, self.logger, "详情页: " + detail_page_url + " 找不到download_url")
         return False
     out_json_message = appstore_class.make_detail2download_json_message(browser)
     out_json_message['detail_url'] = detail_page_url
     out_json_message['download_url'] = download_url
     # Comparison, not assignment: "=" inside an if condition is a
     # SyntaxError, which prevented the module from being imported at all.
     if args['level'] == 'high':
         out_json_message['next_queue'] = config.HIGH_DOWNLOAD_QUEUE
示例#14
0
 def prepare_args(self, args_str):
     """Decode a file-handle message, fill defaults and derive download_time.

     Args:
         args_str: JSON text of the message. It must decode to an object
             containing 'file_name', whose base name (without extension)
             carries a numeric timestamp as its second '_'-separated field.

     Returns:
         The decoded dict with 'level', 'target_folder', 'message_type' and
         'download_time' (local time, '%Y-%m-%d/%H:%M:%S') populated, or
         None (after an error has been logged) when the text is not a JSON
         object, lacks 'file_name', or the timestamp cannot be read from
         the file name.
     """
     try:
         args = json.loads(args_str)
     except (TypeError, ValueError):
         args = None
     # json.loads also accepts scalars, lists and null; none of them can hold
     # the named options, so they are rejected like unparsable text instead
     # of raising TypeError further down.
     if not isinstance(args, dict):
         util.write_log_error(self.lock, self.logger,
                              'can not parse %s to json args.' % (args_str))
         return None
     if 'file_name' not in args:
         util.write_log_error(self.lock, self.logger,
                              'file_name not found.')
         return None
     if 'level' not in args:
         args['level'] = 'low'
     if 'target_folder' not in args:
         args['target_folder'] = config.DEFAULT_FILE_HANDLE_FOLDER
     if 'message_type' not in args:
         args['message_type'] = config.MESSAGE_TYPE_NORMAL
     # The timestamp is the second '_'-separated field of the base name.
     # A name that does not follow that layout is reported and rejected
     # rather than escaping as IndexError/ValueError.
     try:
         base_name = os.path.basename(args['file_name'])
         fields = os.path.splitext(base_name)[0].split('_')
         download_time = float(fields[1])
     except (AttributeError, TypeError, IndexError, ValueError):
         util.write_log_error(
             self.lock, self.logger,
             'can not get download time from file_name %s.' %
             (args['file_name']))
         return None
     args['download_time'] = time.strftime('%Y-%m-%d/%H:%M:%S',
                                           time.localtime(download_time))
     return args
示例#15
0
 def prepare_args(self, args_str):
     """Decode a detail-page message and fill in defaults.

     Args:
         args_str: JSON text of the message. It must decode to an object
             containing 'source' and 'detail_page_url'.

     Returns:
         The decoded dict with 'level' and 'message_type' populated, or
         None (after an error has been logged) when the text is not valid
         JSON, does not decode to a JSON object, or lacks a required key.
     """
     try:
         args = json.loads(args_str)
     except (TypeError, ValueError):
         args = None
     # Scalars, lists and null are valid JSON but cannot hold the named
     # options; without this check they raised TypeError on the key tests.
     if not isinstance(args, dict):
         util.write_log_error(self.lock, self.logger, 'can not parse %s to json args.'%(args_str))
         return None
     if 'level' not in args:
         args['level'] = 'low'
     # Both keys are mandatory; 'source' is checked first.
     for required_key, missing_msg in (
             ('source', 'source not found.'),
             ('detail_page_url', 'detail_page_url not found.')):
         if required_key not in args:
             util.write_log_error(self.lock, self.logger, missing_msg)
             return None
     # if not 'phantomjs_user_agent' in args:
     #     args['phantomjs_user_agent'] = config.PHANTOMJS_USER_AGENT
     if 'message_type' not in args:
         args['message_type'] = config.MESSAGE_TYPE_NORMAL
     return args