def jdholder2(queue, redis_client, JDBase, outlog, cookie=None, start=10, end=40):
    """Pull one task id from *queue*, fetch it via *JDBase*, and record the outcome.

    Success/failure flags are stored through *redis_client*; the fetch result
    itself is appended to *outlog*.  After the attempt the worker sleeps a
    random interval in [start, end] seconds to throttle requests.
    """
    if cookie:
        JDBase.set_cookie(cookie)
    JDBase.set_redis_client(redis_client)
    JDBase.set_result_save_path(outlog)
    JDBase.set_useragent(fetch_util.get_pc_useragent())

    raw_id = queue.get()[0]
    task_id = fetch_util.byte_to_str(raw_id)
    JDBase.set_request_path(task_id)
    JDBase.execute()
    fetch_util.print_log('process ' + JDBase.getshowlog() + '\t\n')

    # Record whether the fetch succeeded so this task id is not retried needlessly.
    if JDBase.get_result():
        JDBase.save_succeed_log(task_id)
    else:
        JDBase.save_failed_log(task_id)
    # Throttle before the next request.
    JDBase.sleep(start, end)
def init_task2(alltaskreadlines, succeedfile, failedfile):
    """Return the tasks from *alltaskreadlines* that still need to be fetched.

    Lines already recorded in *succeedfile* (completed earlier) and in
    *failedfile* (attempted earlier) are filtered out.  Both log files are
    read via ``fetch_util.read``, which yields the file contents in chunks.

    Bug fixed: the original reset the working list from the full
    ``alltaskreadlines`` on every chunk, so only the *last* chunk of each
    log file was actually removed.  The filtered list is now threaded
    through the loop so every chunk's entries are removed.
    """
    beforetasks = alltaskreadlines
    if os.path.exists(succeedfile) and os.path.getsize(succeedfile) > 0:
        # generator protocol handled by a plain for loop (was a manual
        # while/__next__/StopIteration construct)
        for sll in fetch_util.read(succeedfile):
            beforetasks = fetch_util.remove_duplicate(sll, beforetasks)

    nowtasks = beforetasks
    if os.path.exists(failedfile) and os.path.getsize(failedfile) > 0:
        for fll in fetch_util.read(failedfile):
            nowtasks = fetch_util.remove_duplicate(fll, nowtasks)
    else:
        fetch_util.print_log("failed file size = 0")
    return nowtasks
def jdholder(tasks, JDBase, succeedlog, failedlog, outlog, cookie=None, start=10, end=40):
    """Fetch every task in *tasks* with *JDBase*, logging progress and outcomes.

    Success/failure flags go to *succeedlog*/*failedlog*; fetch results are
    appended to *outlog*.  Sleeps a random interval in [start, end] seconds
    between requests.

    Bug fixed: the original called ``jd.set_cookie(cookie)`` unconditionally,
    passing ``None`` when no cookie was supplied; the guard now matches the
    sibling ``jdholder2`` implementations.
    """
    jd = JDBase
    if cookie:
        jd.set_cookie(cookie)
    jd.set_succeed_log_path(succeedlog)
    jd.set_failed_log_path(failedlog)
    jd.set_result_save_path(outlog)

    taskslen = len(tasks)
    count = 0
    for task in tasks:
        count += 1
        # fresh user agent per request
        jd.set_useragent(fetch_util.get_pc_useragent())
        jd.set_request_path(task)
        jd.execute()
        fetch_util.print_log('process [' + str(count) + '/' + str(taskslen) + '] '
                             + ' ' + jd.getshowlog() + '\t\n')
        # Record whether the fetch succeeded so the task is not retried needlessly.
        issucceed = jd.get_result()
        if issucceed:
            jd.save_succeed_log(task)
        else:
            jd.save_failed_log(task)
        # Throttle before the next request.
        jd.sleep(start, end)
def func(lists, maincategory, outfile):
    """Fetch every app page url in *lists*, extract its metadata, and append
    one tab-separated line per url to *outfile*.

    Pauses a random 10-40 seconds between requests to avoid hammering the site.
    """
    if not lists:
        return
    total = len(lists)
    for index, currenturl in enumerate(lists, start=1):
        pageinfo = PageInfo(currenturl)
        appname = pageinfo.getappname()
        # fall back to the site's placeholder value when a field is missing
        category = pageinfo.getcategory() or 'unknow'
        tag = pageinfo.gettag() or 'unknow'
        fetch_util.print_log('[' + str(index) + '/' + str(total) + '] ' + currenturl)
        outinfo = (currenturl + '\t' + maincategory + '>' + appname
                   + '\tc:' + category + '\tt:' + tag)
        # append the result line
        fetch_util.write(outinfo, outfile)
        # random pause between requests
        sleep(fetch_util.randnum(10, 40))
def match_type1(self, html, requesturl, sources):
    """Handle a type-1 product page: build a result line from the breadcrumb
    text in *sources* plus the item name found in ``div#name``, append the
    price, persist it, and mark the request as succeeded."""
    breadcrumb = fetch_util.replace_some_string(sources.get_text(), '>', '>')
    currentresult = requesturl + '\t' + breadcrumb
    name_div = html.find("div", {'id': 'name'})
    if name_div:
        h1 = name_div.find('h1')
        itemname = h1.contents[0]
        if itemname:
            currentresult += '>' + itemname
    # price looked up separately by url; persist the successful fetch
    price = get_price(requesturl)
    self.save_result(currentresult + price)
    fetch_util.print_log(currentresult + price)
    self.set_print_log(requesturl + ' succeed')
    self.set_result(True)
def main():
    """Drive the JD crawl: build the task queue from redis, fan out worker
    threads until the queue is drained, then log and record the elapsed time.

    Bug fixed: the original looped ``while True`` and never terminated, so
    the timing/report code after the loop was unreachable; the loop now ends
    once the queue is empty.  The per-round worker count is also capped at
    the number of remaining tasks, since a surplus worker would block
    forever inside ``queue.get()``.
    """
    starttime = time()
    # output directory / result file
    dirpath = jdconfig.jd_out_dir
    fetch_util.mkdirs(dirpath)
    outlog = dirpath + jdconfig.jd_out

    redis_pool = redis_util.get_redis_pool_connection()
    redis_client = redis.Redis(connection_pool=redis_pool)
    task_iter = task_dispatch.get_all_task_iter(redis_client, 'jd_20161024')
    queue = Queue()
    task_dispatch.get_task_queue(queue, task_iter)
    task_size = queue.qsize()
    if task_size == 0:
        fetch_util.print_log("任务总数为0,结束任务")
        return
    fetch_util.print_log("任务总数: " + str(task_size))

    # cookie pins the delivery region
    cookie = jdutil.jd_pc_cookie('beijing')
    jd = JDPage()
    while not queue.empty():
        threads = []
        # never start more workers than there are tasks left
        for _ in range(min(jdconfig.thread_num, queue.qsize())):
            th = threading.Thread(target=jdutil.jdholder2,
                                  args=(queue, redis_client, jd, outlog, cookie, 5, 10))
            th.start()
            threads.append(th)
        for th in threads:
            th.join()

    endtime = time()
    fetch_util.print_log("网页数据获取结束,耗时: " + str(float(endtime - starttime) / 60) + "分钟")
    # persist the elapsed time (minutes) to a timestamped file
    costtimepath = dirpath + 'jd_' + fetch_util.get_time_yyyymmddhhmmss() + '.txt'
    fetch_util.write(str(float(endtime - starttime) / 60), costtimepath, 'w')
def init_task(urlsfile, succeedfile, failedfile):
    """Build the pending task list: every line of *urlsfile* minus the lines
    already present in *succeedfile* or *failedfile*.

    Returns [] (with a log message) when *urlsfile* does not exist.
    """
    if not fetch_util.file_exist(urlsfile):
        fetch_util.print_log(urlsfile + " not exist")
        return []
    with open(urlsfile, 'r') as fh:
        alltasks = fh.readlines()

    def _read_lines(path):
        # read a log file's lines, tolerating a missing file
        if os.path.exists(path):
            with open(path, "r") as fh:
                return fh.readlines()
        return []

    succeeded = _read_lines(succeedfile)
    failed = _read_lines(failedfile)
    remaining = fetch_util.remove_duplicate(succeeded, alltasks)
    return fetch_util.remove_duplicate(failed, remaining)
def jdholder2(task, JDBase, succeedlog, failedlog, outlog, cookie=None, start=10, end=40):
    """Fetch a single *task* url with *JDBase* and record the outcome.

    Success/failure flags go to *succeedlog*/*failedlog*; the fetch result is
    appended to *outlog*.  Sleeps a random interval in [start, end] seconds
    after the attempt.
    """
    if cookie:
        JDBase.set_cookie(cookie)
    JDBase.set_succeed_log_path(succeedlog)
    JDBase.set_failed_log_path(failedlog)
    JDBase.set_result_save_path(outlog)
    JDBase.set_useragent(fetch_util.get_pc_useragent())
    JDBase.set_request_path(task)
    JDBase.execute()
    fetch_util.print_log('process ' + JDBase.getshowlog() + '\t\n')

    # Record whether the fetch succeeded so the task is not retried needlessly.
    if JDBase.get_result():
        JDBase.save_succeed_log(task)
    else:
        JDBase.save_failed_log(task)
    # Throttle before the next request.
    JDBase.sleep(start, end)
def save_result(path, request_url, bread_tag_name, price):
    """Persist one fetch result line (url, breadcrumb, price) to *path*.

    Logs a failure when no breadcrumb was extracted, and a warning when no
    output path was configured.
    """
    if not bread_tag_name:
        fetch_util.print_log(request_url + ' failed')
        return
    save_info = request_url + '\t' + bread_tag_name + '\t' + price
    fetch_util.print_log(request_url + ' succeed')
    fetch_util.print_log_debug(save_info)
    if not path:
        # no output path configured — result is logged but not persisted
        fetch_util.print_log('****** 未设置结果存储路径!!! ******')
        return
    fetch_util.write(save_info, path)
def main():
    """Crawl wandoujia category pages and append app metadata to the output file.

    For each configured category url, collect the child page urls, then fetch
    each page's app urls in batches of 10 on worker threads via ``func``.
    """
    # output location and the category urls to start from
    outfile = '/Users/Lan/TestDir/out/wandoujia.txt'
    specurls = ['http://www.wandoujia.com/category/396']
    # outfile = wandoujiaconfig.outfile
    # specurls = wandoujiaconfig.specurls

    fetch_util.print_log('update request urls ...')
    collected = []
    for specurl in specurls:
        collected.append(MainCategoryUrls(specurl).geturls())
    # flatten the per-category url lists into one list
    urls = fetch_util.liststolist(collected)
    fetch_util.print_log('update request urls finished, len: ' + str(len(urls)))

    for url in urls:
        requesturls = ParentPage(url=url).getpageurls()
        if not requesturls:
            continue
        for requesturl in requesturls:
            parentpageurl = ParentPageUrl(requesturl)
            # main category of the current page, placeholder when missing
            maincategory = parentpageurl.getcategory() or 'unknow'
            currenturls = parentpageurl.getcurpageurls()
            if not currenturls:
                continue
            # split the page's urls into batches and fetch each batch on its own thread
            workers = []
            for batch in fetch_util.task_dispatch(currenturls, 10):
                th = threading.Thread(target=func, args=(batch, maincategory, outfile))
                th.start()
                workers.append(th)
            for th in workers:
                th.join()
            # separator between batches in the output file
            fetch_util.write('\r\n------ i am line -----\r\n', outfile)
        fetch_util.print_log("has finish: " + url)
        # random pause before the next category page
        sleep(fetch_util.randnum(10, 30))