def prepare_args(self, args_str):
    """Parse a download-queue message into an args dict with defaults.

    Returns None when args_str is not valid JSON or lacks the mandatory
    "download_url" key; otherwise fills in every absent optional key with
    its configured default and returns the dict.
    """
    try:
        args = json.loads(args_str)
    except (TypeError, ValueError):
        util.write_log_error(self.lock, self.logger, "can not parse %s to json args." % (args_str))
        return None
    if "download_url" not in args:
        # download_url is the only mandatory field.
        util.write_log_error(self.lock, self.logger, "download_url not found.")
        return None
    # IDIOM FIX: `x not in d` instead of `not x in d` (PEP 8).  The per-key
    # "if missing" form is kept (rather than dict.setdefault) so defaults
    # such as self.DEFAULT_DOWNLOAD_FILE_NAME() are evaluated only when the
    # key is actually absent.
    if "level" not in args:
        args["level"] = "low"
    if "method" not in args:
        args["method"] = config.REQUESTS_ACTION_GET
    if "data" not in args:
        args["data"] = None
    if "request_headers" not in args:
        args["request_headers"] = config.REQUEST_HEADERS
    if "download_cookie_producer" not in args:
        args["download_cookie_producer"] = None
    if "download_folder" not in args:
        args["download_folder"] = config.DEFAULT_DOWNLOAD_FOLDER
    if "download_file_name" not in args:
        args["download_file_name"] = self.DEFAULT_DOWNLOAD_FILE_NAME()
    if "download_file_type" not in args:
        args["download_file_type"] = config.DEFAULT_DOWNLOAD_FILE_TYPE
    if "target_folder" not in args:
        args["target_folder"] = config.DEFAULT_FILE_HANDLE_FOLDER
    if "message_type" not in args:
        args["message_type"] = config.MESSAGE_TYPE_NORMAL
    return args
def crawl(self, args, browser):
    # Crawl one app-store list page: collect the detail-page URLs on it,
    # queue them, and queue the next list page when pagination continues.
    # Returns True on success, False when the store module is missing or the
    # URL was crawled recently.
    source = args['source']
    list_page_url = args['list_page_url']
    try:
        # 'source' names a per-store module (<source>.py) exposing a Store class.
        appstore_modual = __import__(source)
    except ImportError:
        util.write_log_error(self.lock, self.logger, "can not find " + source + ".py in configured appstores")
        return False
    appstore_class = appstore_modual.Store()
    # Dedup: skip URLs already crawled within CRAWLED_URL_REDIS_TIMEOUT
    # (keyed in redis by the URL's MD5).
    if self.url_rd.exists(util.get_MD5(list_page_url)):
        util.write_log_warn(self.lock, self.logger, "list_page_url:" + list_page_url + "was crawled in past %d seconds"%(config.CRAWLED_URL_REDIS_TIMEOUT))
        return False
    # Use the store-specific user agent for this PhantomJS session.
    browser.desired_capabilities["phantomjs.page.settings.userAgent"] = appstore_class.PHANTOMJS_USER_AGENT
    util.browser_get_url(browser, list_page_url)
    # appstore_class.scroll_down(browser)
    # Click the store's "check more" control until it disappears; the count
    # of clicks is logged (Chinese log text: "list page: ... no checkmore
    # found after N clicks").
    check_more_count = appstore_class.click_checkmore(browser)
    util.write_log_warn(self.lock, self.logger, "列表页: " + list_page_url + "点击checkmore" + str(check_more_count) + "次后找不到checkmore")
    detail_urls = appstore_class.get_detail_urls(browser)
    if len(detail_urls) == 0:
        # Chinese log text: "list page: ... found 0 apps".
        util.write_log_warn(self.lock, self.logger, "列表页: " + list_page_url + " 找到0个app")
    self.push_detail_page_message(args, detail_urls)
    # Follow pagination: queue the next list page if one exists.
    next_list_page = appstore_class.click_nextpage(browser)
    if next_list_page:
        self.push_list_page_message(args, next_list_page)
    util.write_log_info(self.lock, self.logger,"SUCCESS : list_page_url: %s crawled"%(list_page_url))
    # Mark the URL as crawled, with an expiry so it can be re-crawled later.
    self.url_rd.set(util.get_MD5(list_page_url),1)
    self.url_rd.expire(util.get_MD5(list_page_url), config.CRAWLED_URL_REDIS_TIMEOUT)
    return True
def prepare_args(self, args_str):
    """Parse a download-queue message into an args dict with defaults.

    Returns None when args_str is not valid JSON or lacks the mandatory
    'download_url' key; otherwise fills in every absent optional key with
    its configured default and returns the dict.
    """
    try:
        args = json.loads(args_str)
    except (TypeError, ValueError):
        util.write_log_error(self.lock, self.logger, 'can not parse %s to json args.' % (args_str))
        return None
    if 'download_url' not in args:
        # download_url is the only mandatory field.
        util.write_log_error(self.lock, self.logger, 'download_url not found.')
        return None
    # IDIOM FIX: `x not in d` instead of `not x in d` (PEP 8).  The per-key
    # "if missing" form is kept (rather than dict.setdefault) so defaults
    # such as self.DEFAULT_DOWNLOAD_FILE_NAME() are evaluated only when the
    # key is actually absent.
    if 'level' not in args:
        args['level'] = 'low'
    if 'method' not in args:
        args['method'] = config.REQUESTS_ACTION_GET
    if 'data' not in args:
        args['data'] = None
    if 'request_headers' not in args:
        args['request_headers'] = config.REQUEST_HEADERS
    if 'download_cookie_producer' not in args:
        args['download_cookie_producer'] = None
    if 'download_folder' not in args:
        args['download_folder'] = config.DEFAULT_DOWNLOAD_FOLDER
    if 'download_file_name' not in args:
        args['download_file_name'] = self.DEFAULT_DOWNLOAD_FILE_NAME()
    if 'download_file_type' not in args:
        args['download_file_type'] = config.DEFAULT_DOWNLOAD_FILE_TYPE
    if 'target_folder' not in args:
        args['target_folder'] = config.DEFAULT_FILE_HANDLE_FOLDER
    if 'message_type' not in args:
        args['message_type'] = config.MESSAGE_TYPE_NORMAL
    return args
def work_process(self, browser):
    """Long-running worker: pop detail-page messages off redis and crawl them.

    Blocks forever on brpop over the high- and low-priority detail-page
    queues; each popped message body is handed to do_work together with the
    PhantomJS browser dedicated to this process.
    """
    import traceback  # hoisted out of the loop; kept function-local as in the original
    util.write_log_info(self.lock, self.logger, 'new detail_page_handler process started')
    while 1:
        try:
            # brpop with timeout 0 blocks until a message arrives; the HIGH
            # queue is listed first so it drains with priority.
            message = self.rd.brpop([config.HIGH_DETAIL_PAGE_HANDLE_QUEUE,
                                     config.LOW_DETAIL_PAGE_HANDLE_QUEUE], 0)
            # NOTE: "nessage"/"catched" typos preserved from the original log strings.
            util.write_log_info(self.lock, self.logger, 'received new nessage: %s' % (message[1]))
            self.do_work(message[1], browser)
        except Exception:
            # BUG FIX: traceback.print_exc() returns None, so concatenating it
            # to a str raised TypeError inside this handler; format_exc()
            # returns the traceback text we actually want to log.
            util.write_log_error(self.lock, self.logger,
                                 "detail_page_handler process catched exception." + traceback.format_exc())
def work_process(self):
    """Long-running worker: pop tmpfile messages off redis and process them.

    Blocks forever on brpop over the high- and low-priority file-handle
    queues; each popped message body is handed to do_work.
    """
    import traceback  # hoisted out of the loop; kept function-local as in the original
    util.write_log_info(self.lock, self.logger, 'new tmpfile_hander process started')
    while 1:
        try:
            # brpop with timeout 0 blocks until a message arrives; the HIGH
            # queue is listed first so it drains with priority.
            message = self.rd.brpop([config.HIGH_FILE_HANDLE_QUEUE,
                                     config.LOW_FILE_HANDLE_QUEUE], 0)
            # NOTE: "nessage"/"hander"/"catched" typos preserved from the original log strings.
            util.write_log_info(self.lock, self.logger, 'received new nessage: %s' % (message[1]))
            self.do_work(message[1])
        except Exception:
            # BUG FIX: traceback.print_exc() returns None, so concatenating it
            # to a str raised TypeError inside this handler; format_exc()
            # returns the traceback text we actually want to log.
            util.write_log_error(self.lock, self.logger,
                                 "tmpfile_hander process catched exception." + traceback.format_exc())
def launch(self):
    """Spawn up to MAX_DETAIL_PAGE_HANDLER_PROCESS_NUM daemon worker
    processes, each bound to its own PhantomJS browser instance."""
    while len(self.phantomjs_webdrivers) < config.MAX_DETAIL_PAGE_HANDLER_PROCESS_NUM:
        phantomJS_browser = util.start_phantomJS_browser()
        if not phantomJS_browser:
            # BUG FIX: the original logged the failure and looped again
            # immediately; since the webdriver list never grows on failure,
            # a persistently failing PhantomJS start became a tight infinite
            # loop spamming the error log.  Log once and bail out instead.
            util.write_log_error(self.lock, self.logger, "phantomJS start failed.")
            break
        self.phantomjs_webdrivers.append(phantomJS_browser)
        process = multiprocessing.Process(target=self.work_process, args=(phantomJS_browser,))
        process.daemon = True  # workers die with the parent
        self.processes.append(process)
        process.start()
def work_thread(self):
    """Long-running worker: pop download messages off redis and download them.

    Blocks forever on brpop over the high- and low-priority download queues;
    each popped message body is handed to do_work.
    """
    import traceback  # hoisted out of the loop; kept function-local as in the original
    util.write_log_info(self.lock, self.logger, "new downloader thread started")
    while 1:
        try:
            # brpop with timeout 0 blocks until a message arrives; the HIGH
            # queue is listed first so it drains with priority.
            message = self.rd.brpop([config.HIGH_DOWNLOAD_QUEUE, config.LOW_DOWNLOAD_QUEUE], 0)
            # NOTE: "nessage"/"catched" typos preserved from the original log strings.
            util.write_log_info(self.lock, self.logger, "received new nessage: %s" % (message[1]))
            self.do_work(message[1])
        except Exception:
            # BUG FIX: traceback.print_exc() returns None, so concatenating it
            # to a str raised TypeError inside this handler; format_exc()
            # returns the traceback text we actually want to log.
            util.write_log_error(self.lock, self.logger,
                                 "downloader thread catched exception." + traceback.format_exc())
def normal(self, args):
    """Download args["download_url"] through a prepared session.

    Logs success or failure and returns (succeeded: bool, redirect_urls).
    """
    session = self.prepare_download_session(args["download_cookie_producer"], args["request_headers"])
    outcome, redirect_urls = self.download(session, args)
    # self.download reports the literal string "success" on success and a
    # failure reason otherwise.
    succeeded = outcome == "success"
    if succeeded:
        util.write_log_info(
            self.lock,
            self.logger,
            "SUCCESS : download_url: %s , file_name: %s " % (args["download_url"], args["file_name"]),
        )
    else:
        util.write_log_error(
            self.lock,
            self.logger,
            "FAIL : download_url: %s ; reason: %s" % (args["download_url"], outcome),
        )
    return succeeded, redirect_urls
def work_thread(self):
    """Long-running worker: pop download messages off redis and download them.

    Blocks forever on brpop over the high- and low-priority download queues;
    each popped message body is handed to do_work.
    """
    import traceback  # hoisted out of the loop; kept function-local as in the original
    util.write_log_info(self.lock, self.logger, 'new downloader thread started')
    while 1:
        try:
            # brpop with timeout 0 blocks until a message arrives; the HIGH
            # queue is listed first so it drains with priority.
            message = self.rd.brpop(
                [config.HIGH_DOWNLOAD_QUEUE, config.LOW_DOWNLOAD_QUEUE], 0)
            # NOTE: "nessage"/"catched" typos preserved from the original log strings.
            util.write_log_info(self.lock, self.logger, 'received new nessage: %s' % (message[1]))
            self.do_work(message[1])
        except Exception:
            # BUG FIX: traceback.print_exc() returns None, so concatenating it
            # to a str raised TypeError inside this handler; format_exc()
            # returns the traceback text we actually want to log.
            util.write_log_error(
                self.lock, self.logger,
                'downloader thread catched exception.' + traceback.format_exc())
def work_process(self):
    """Long-running worker: pop tmpfile messages off redis and process them.

    Blocks forever on brpop over the high- and low-priority file-handle
    queues; each popped message body is handed to do_work.
    """
    import traceback  # hoisted out of the loop; kept function-local as in the original
    util.write_log_info(self.lock, self.logger, 'new tmpfile_hander process started')
    while 1:
        try:
            # brpop with timeout 0 blocks until a message arrives; the HIGH
            # queue is listed first so it drains with priority.
            message = self.rd.brpop([
                config.HIGH_FILE_HANDLE_QUEUE, config.LOW_FILE_HANDLE_QUEUE
            ], 0)
            # NOTE: "nessage"/"hander"/"catched" typos preserved from the original log strings.
            util.write_log_info(self.lock, self.logger, 'received new nessage: %s' % (message[1]))
            self.do_work(message[1])
        except Exception:
            # BUG FIX: traceback.print_exc() returns None, so concatenating it
            # to a str raised TypeError inside this handler; format_exc()
            # returns the traceback text we actually want to log.
            util.write_log_error(
                self.lock, self.logger,
                "tmpfile_hander process catched exception." + traceback.format_exc())
def normal(self, args):
    """Download args['download_url'] through a prepared session.

    Logs the outcome and returns (succeeded: bool, redirect_urls).
    """
    session = self.prepare_download_session(
        args['download_cookie_producer'], args['request_headers'])
    result, redirects = self.download(session, args)
    # Failure first: anything other than the literal 'success' string is a
    # failure reason reported by self.download.
    if result != 'success':
        util.write_log_error(
            self.lock, self.logger,
            "FAIL : download_url: %s ; reason: %s" % (args['download_url'], result))
        return False, redirects
    util.write_log_info(
        self.lock, self.logger,
        "SUCCESS : download_url: %s , file_name: %s " % (args['download_url'], args['file_name']))
    return True, redirects
def prepare_args(self, args_str):
    """Parse and normalise a tmpfile-handler message.

    Returns the args dict with defaults filled in and 'download_time'
    derived from the file-name stem, or None when the message is invalid
    JSON, lacks 'file_name', or the file name does not follow the
    '<name>_<epoch>' stem convention.
    """
    try:
        args = json.loads(args_str)
    except (TypeError, ValueError):
        util.write_log_error(self.lock, self.logger, 'can not parse %s to json args.'%(args_str))
        return None
    if 'file_name' not in args:
        # file_name is the only mandatory field.
        util.write_log_error(self.lock, self.logger, 'file_name not found.')
        return None
    # IDIOM FIX: `x not in d` instead of `not x in d` (PEP 8).
    if 'level' not in args:
        args['level'] = 'low'
    if 'target_folder' not in args:
        args['target_folder'] = config.DEFAULT_FILE_HANDLE_FOLDER
    if 'message_type' not in args:
        args['message_type'] = config.MESSAGE_TYPE_NORMAL
    # The download timestamp is encoded in the file stem as '<name>_<epoch>'.
    try:
        download_time = float(os.path.splitext(os.path.basename(args['file_name']))[0].split('_')[1])
    except (IndexError, ValueError):
        # ROBUSTNESS FIX: a malformed file name used to raise out of this
        # method; reject the message gracefully like the other validation
        # errors above.
        util.write_log_error(self.lock, self.logger, 'can not extract download time from %s.'%(args['file_name']))
        return None
    args['download_time'] = time.strftime('%Y-%m-%d/%H:%M:%S',time.localtime(download_time))
    return args
def crawl(self, args, browser):
    """Crawl one app detail page and build the download-queue message.

    Returns False when the store module is missing, the URL was crawled
    recently, or no download URL is found on the page.
    """
    source = args['source']
    detail_page_url = args['detail_page_url']
    try:
        # 'source' names a per-store module (<source>.py) exposing a Store class.
        appstore_modual = __import__(source)
    except ImportError:
        util.write_log_error(self.lock, self.logger, "can not find " + source + ".py in configured appstores")
        return False
    appstore_class = appstore_modual.Store()
    # Dedup: skip URLs already crawled within CRAWLED_URL_REDIS_TIMEOUT
    # (keyed in redis by the URL's MD5).
    if self.url_rd.exists(util.get_MD5(detail_page_url)):
        util.write_log_warn(self.lock, self.logger, "detail_page_url:" + detail_page_url + "was crawled in past %d seconds"%(config.CRAWLED_URL_REDIS_TIMEOUT))
        return False
    # Use the store-specific user agent for this PhantomJS session.
    browser.desired_capabilities["phantomjs.page.settings.userAgent"] = appstore_class.PHANTOMJS_USER_AGENT
    util.browser_get_url(browser, detail_page_url)
    download_url = appstore_class.get_download_url(browser)
    if download_url is None:
        # Chinese log text: "detail page: ... download_url not found".
        util.write_log_error(self.lock, self.logger, "详情页: " + detail_page_url + " 找不到download_url")
        return False
    out_json_message = appstore_class.make_detail2download_json_message(browser)
    out_json_message['detail_url'] = detail_page_url
    out_json_message['download_url'] = download_url
    # BUG FIX: the original used assignment '=' in this condition, which is
    # a SyntaxError in Python; '==' is the intended comparison.
    if args['level'] == 'high':
        out_json_message['next_queue'] = config.HIGH_DOWNLOAD_QUEUE
    # NOTE(review): this function appears truncated in the source chunk —
    # there is no low-priority branch, no message push, and no success
    # return visible here; confirm against the full file before relying on
    # the tail behavior.
def prepare_args(self, args_str):
    """Parse and normalise a tmpfile-handler message.

    Returns the args dict with defaults filled in and 'download_time'
    derived from the file-name stem, or None when the message is invalid
    JSON, lacks 'file_name', or the file name does not follow the
    '<name>_<epoch>' stem convention.
    """
    try:
        args = json.loads(args_str)
    except (TypeError, ValueError):
        util.write_log_error(self.lock, self.logger, 'can not parse %s to json args.' % (args_str))
        return None
    if 'file_name' not in args:
        # file_name is the only mandatory field.
        util.write_log_error(self.lock, self.logger, 'file_name not found.')
        return None
    # IDIOM FIX: `x not in d` instead of `not x in d` (PEP 8).
    if 'level' not in args:
        args['level'] = 'low'
    if 'target_folder' not in args:
        args['target_folder'] = config.DEFAULT_FILE_HANDLE_FOLDER
    if 'message_type' not in args:
        args['message_type'] = config.MESSAGE_TYPE_NORMAL
    # The download timestamp is encoded in the file stem as '<name>_<epoch>'.
    try:
        download_time = float(
            os.path.splitext(os.path.basename(args['file_name']))[0].split('_')[1])
    except (IndexError, ValueError):
        # ROBUSTNESS FIX: a malformed file name used to raise out of this
        # method; reject the message gracefully like the other validation
        # errors above.
        util.write_log_error(self.lock, self.logger, 'can not extract download time from %s.' % (args['file_name']))
        return None
    args['download_time'] = time.strftime('%Y-%m-%d/%H:%M:%S',
                                          time.localtime(download_time))
    return args
def prepare_args(self, args_str):
    """Parse a detail-page-queue message into an args dict with defaults.

    Returns None when args_str is not valid JSON or lacks the mandatory
    'source' or 'detail_page_url' keys; otherwise fills in absent optional
    keys and returns the dict.
    """
    try:
        args = json.loads(args_str)
    except (TypeError, ValueError):
        util.write_log_error(self.lock, self.logger, 'can not parse %s to json args.'%(args_str))
        return None
    # IDIOM FIX: `x not in d` instead of `not x in d` (PEP 8).
    if 'level' not in args:
        args['level'] = 'low'
    if 'source' not in args:
        # source selects the per-store crawler module; mandatory.
        util.write_log_error(self.lock, self.logger, 'source not found.')
        return None
    if 'detail_page_url' not in args:
        util.write_log_error(self.lock, self.logger, 'detail_page_url not found.')
        return None
    # if 'phantomjs_user_agent' not in args:
    #     args['phantomjs_user_agent'] = config.PHANTOMJS_USER_AGENT
    if 'message_type' not in args:
        args['message_type'] = config.MESSAGE_TYPE_NORMAL
    return args