def getPageInARow(url, searchword, firstPage, topTabList, elementUrl):
    """Crawl the Google news tab for one search word, page by page.

    Drives a visible Chrome browser: opens `url`, types the search word,
    switches to the news tab, then extracts result links from every page
    until no next page exists.

    Returns a dict carrying the crawl timestamp, the keyword, the number
    of collected links, and the page->links mapping itself.
    """
    browser = buildSplinterBrowser("chrome")
    browser.visit(url)
    browserWaitTime(browser)
    searchwordKeyInAndEnter(browser, searchword)
    browser.driver.set_window_size(1024, 768)

    forSureNws = findOutNws(browser, topTabList)
    # Last tab whose label is '新聞' (the news tab).
    keyNews = [key for key, value in forSureNws.items() if value == '新聞'][-1]
    # Human-simulated mouse_over must skip the news tab itself.
    # NOTE(review): this mutates the caller's list in place.
    topTabList.remove(int(keyNews))

    print(f"點擊 {keyNews} 去到 新聞頁")
    # Click the news tab.
    browser.find_by_xpath(f'//*[@id="hdtb-msb-vis"]/div[{keyNews}]/a').click()
    timeSleepRandomly()

    newsDict = {}
    newsDictInner = {}
    while True:
        print(f"進行 {searchword} 第", firstPage, "頁")
        elementUrlExtract(browser, topTabList, elementUrl, newsDictInner)
        if not judgeNextPage(browser):
            browser.quit()
            break
        print("仍有下一頁,繼續爬取!")
        firstPage += 1

    newsDict["dateTime"] = timeStampGenerator()
    newsDict["keyword"] = searchword
    newsDict["newsTotalNum"] = len(newsDictInner)
    newsDict["newsUrl"] = newsDictInner
    return newsDict
def getPageInARowAdvanced(input, objectiveFolder, objective):
    """Worker: fetch one Google result page per queue item and dump its HTML.

    Queue items are '+'-joined strings: "searchword+page+totalPage+url".
    Each page is retried up to 3 times, each time with a freshly built
    headless browser; after three failures the URL is appended to a
    badRequest log and an empty file is written instead.

    Runs forever on the JoinableQueue `input`; task_done() is called once
    per item so the main process can join().

    FIX vs. original: the bare `except:` around browser.quit() is narrowed
    to `except Exception` (it no longer swallows KeyboardInterrupt /
    SystemExit; it still covers the NameError raised when the browser was
    never successfully created in the retry loop).
    """
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        consecutiveUrl = input.get()
        searchword, page, totalPage, url = consecutiveUrl.split('+')
        print(f"{thisPID}__{getPageInARowAdvanced_proc} 開始處理 {searchword} 的第 {page} 頁:")

        # Rebuilding the browser inside the loop avoids the site refusing a
        # single long-lived browser that keeps visiting pages.
        for i in range(3):
            try:
                timeSleepFour()
                browser = buildSplinterBrowserHeadless('chrome')
                timeSleepRandomly()
                browser.visit(url)
                browserWaitTime(browser)
                timeSleepTwo()
                tempHtml = browser.html
                timeSleepRandomly()
                soup = BeautifulSoup(tempHtml, 'html.parser')
                print(f"讀取{searchword}第 {page} 頁,成功!")
                break
            except (ConnectionRefusedError, TimeoutException, WebDriverException) as e:
                print(f"{thisPID}__{getPageInARowAdvanced_proc} 讀取 {searchword} 第 {page} 頁有問題。", e)
                print(f"{thisPID}__{getPageInARowAdvanced_proc} 重建browser物件,進行再處理 {i} 次!")
                timeSleepFour()
                timeSleepRandomly()
                soup = ""

        if not soup:
            # All retries failed: record the URL so it can be re-crawled later.
            badRequestRoute = f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/badRequest"
            with open(f"{badRequestRoute}/badRequest_{searchword}.txt", "a", newline='', encoding='utf-8') as f:
                errorMessage = url + "\n"
                f.write(errorMessage)  # writelines would apply if errorMessage were a list

        with open(f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/{searchword}/{page}_{totalPage}_{searchword}.txt",
                  'w', encoding='utf-8') as f:
            f.write(str(soup))
        print()
        print(f'{thisPID} 成功寫出 {searchword} 第{page}頁,總共{totalPage} 頁。')

        try:
            browser.quit()
            print(f"成功關閉 browser{thisPID}__{getPageInARowAdvanced_proc}++++++++++++++++++++++++++++++")
        except Exception:
            # Browser is unusable (or was never created): give up and kill
            # this worker so the pool can replace it.
            print(f"放棄 {thisPID}__{getPageInARowAdvanced_proc} 這個browser。")
            print(f"kill {thisPID}__{getPageInARowAdvanced_proc} >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
            os.kill(thisPID, signal.SIGKILL)

        input.task_done()  # Tell the main process this queue item is done.

    # NOTE(review): unreachable — the `while True` above never breaks.
    end = timeCalculate()
    print(f'{thisPID}__getPageInARowAdvanced 累計耗時:{end-begin} 秒')
def getPageInARow(input, output, keywordUrlPair, objectiveFolder, objective):
    """Worker: fetch page 1 for each queued search word, then fan out the rest.

    For each keyword pulled from `input`: resets the raw-data folder, reads
    the first result page (up to 3 retries with a fresh headless browser),
    derives totalPage from the '.totalTxt' element, writes page 1 to disk,
    and puts "searchword+page+totalPage+pageUrl" jobs for pages
    2..totalPage on `output` for getPageInARowAdvanced workers.

    Raises AttributeError (re-raised deliberately) when the page count
    cannot be parsed — including when all retries failed and soup is "".

    FIX vs. original: the bare `except:` around browser.quit() is narrowed
    to `except Exception` so KeyboardInterrupt / SystemExit pass through.
    """
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        searchword = input.get()
        print('getPageInARow is in new process %s, %s ' % (getPageInARow_proc, thisPID))
        print()
        eraseRawData(objectiveFolder, objective, searchword)
        mkdirForRawData(objectiveFolder, objective, searchword)
        url = keywordUrlPair[searchword]

        # Rebuilding the browser inside the loop avoids the site refusing a
        # single long-lived browser that keeps visiting pages.
        for i in range(3):
            try:
                timeSleepOne()
                timeSleepRandomly()
                browser = buildSplinterBrowserHeadless('chrome')
                timeSleepRandomly()
                browser.visit(url)
                browserWaitTime(browser)
                timeSleepTwo()
                tempHtml = browser.html
                timeSleepRandomly()
                soup = BeautifulSoup(tempHtml, 'html.parser')
                print(f"讀取{searchword}第 1 頁>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>成功!")
                break
            except (ConnectionRefusedError, TimeoutException, WebDriverException) as e:
                print(f"{thisPID}__{getPageInARow_proc} 讀取{searchword}第 1 頁有問題。", e)
                print(f"{thisPID}__{getPageInARow_proc} 重建browser物件,進行再處理 {i} 次!")
                timeSleepFour()
                timeSleepRandomly()
                soup = ""

        try:
            totalPage = interDiv(searchNums(soup.select_one('.totalTxt').text), 30)
        except AttributeError as e:
            # soup is "" (every retry failed) or the page lacks '.totalTxt';
            # stop this worker loudly instead of continuing with bad state.
            print("getPageInARow 出錯", e)
            raise

        print('------接下來要處理 ' + searchword + ' 的頁數---------', totalPage, '頁')
        print()

        with open(f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/{searchword}/1_{totalPage}_{searchword}.txt",
                  'w', encoding='utf-8') as f:
            f.write(str(soup))
        print()
        print(f'成功寫出 {searchword} 第 1 頁')

        i_browser = 1
        try:
            browser.quit()
            print(f"成功關閉 browser{getPageInARow_proc}++++++++++++++++++++++++++++++")
        except Exception:
            # Browser is unusable: give up and kill this worker so the
            # pool can replace it.
            print(f"放棄 {thisPID}__{getPageInARow_proc} 的 第{i_browser}個browser。")
            i_browser += 1
            print(f"kill {thisPID}__{getPageInARow_proc} >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
            os.kill(thisPID, signal.SIGKILL)

        # Rest long enough that every searchword's first page gets read.
        timeSleepEight()
        timeSleepEight()

        for num in range(2, totalPage + 1):
            strNum = str(num)
            consecutiveData = searchword + "+" + strNum + "+" + str(totalPage) + "+" + re.sub(r"curPage=1", f"curPage={strNum}", url)
            output.put(consecutiveData)
        print()
        input.task_done()  # Tell the main process this queue item is done.

    # NOTE(review): unreachable — the `while True` above never breaks.
    end = timeCalculate()
    print(f'{thisPID}__getPageInARow 累計耗時:{end-begin} 秒')
def humanSimulate(cls, browser):
    """Hover over random page elements so the crawl looks human.

    Randomly mouse-overs one page-number link, one of the four sort tabs
    (accuracy / price / ...), and the brand or product-category menu,
    scrolling to the bottom and back to the top around the hovers.

    click() can fail after mouse_over because the element may have moved
    out of the viewport (splinter raises "Element ... is not clickable at
    point"), but mouse_over itself and is_element_present_by_xpath are
    unaffected by viewport position — so only AttributeError (element
    lookup yielded nothing hoverable) needs handling here.
    """
    # Draw the random targets up front (same draw order as always).
    searchTypeChoice = random.choice(list(range(1, 5)))
    pageChoice = random.choice(list(range(1, 13)))  # first page bar holds only 12 entries
    brandClassChoice = random.choice(list(range(2)))

    try:
        try:
            # Page-number bar: up to 12 entries; the last pager view may
            # have fewer, so the lookup can come back empty.
            browser.find_by_xpath(
                f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{pageChoice}]/a'
            ).mouse_over()
            browserWaitTime(browser)
        except AttributeError as e:
            print("頁碼不足12項___擬人化操作找不到 Element。", e)
            browserWaitTime(browser)

        browser.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        timeSleepOne()

        # One of the four sort tabs (accuracy, price, ...).
        browser.find_by_xpath(
            f'//*[@id="bt_2_layout_Content"]/div[3]/span/ul/li[{searchTypeChoice}]'
        ).mouse_over()
        timeSleepOne()

        # Brand / product-category menu: two possible ids, probed in a
        # randomly chosen order; hover the first one present.
        if brandClassChoice:
            menuXpaths = ('//*[@id="categoriesBtn"]', '//*[@id="bt_0_layout_b203"]')
        else:
            menuXpaths = ('//*[@id="bt_0_layout_b203"]', '//*[@id="categoriesBtn"]')
        for menuXpath in menuXpaths:
            if browser.is_element_present_by_xpath(menuXpath):
                browser.find_by_xpath(menuXpath).mouse_over()
                break

        timeSleepOne()
        browser.execute_script('window.scrollTo(0,0);')
    except AttributeError as e:
        print("擬人化操作找不到 Element。", e)
def browserClickPageNumber(cls, browser, currentPage, totalPage, searchword):
    """Click through the result pager until `currentPage` is displayed.

    The pager shows at most ten page links plus navigation arrows, in
    three shapes (li indices shift by 2 once the '|<' '<<' arrows appear):

        default view:        1 2 3 4 5 6 7 8 9 10 >> >|
        middle view:   |< << 11 12 13 14 15 16 17 18 19 20 >> >|
        last view:     |< << 281 282 283 284

    When the target page lies in the back half (and is > 10), jump to the
    last view via '>|' (li[12]) and page backwards with '<<' (li[2]);
    otherwise page forwards from the default view with '>>' (li[11] in
    the default view, li[13] once the left arrows appear).

    Example element access:
        accuratePage = browser.find_by_xpath('//*[@id="bt_2_layout_Content"]/div[2]/ul/li[1]/a')
        accuratePage.text
    """
    currentPageNum = int(currentPage)
    totalPageNum = int(totalPage)
    halfTotalPageNum = totalPageNum // 2
    if currentPageNum > halfTotalPageNum and currentPageNum > 10:
        # Jump straight to the last pager view ('>|').
        browser.find_by_xpath(
            f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[12]/a').click()
        timeSleepOne()
        if currentPageNum != totalPageNum and currentPageNum // 10 == totalPageNum // 10:
            # Target sits in the same ten-block as the last page.
            if currentPageNum % 10 != 0:
                # e.g. 13、18
                clickBeforeTimes = 0
            elif currentPageNum % 10 == 0:
                # e.g. 290、299
                clickBeforeTimes = 1
            # Page backwards ('<<') to the view containing the target.
            for i in range(clickBeforeTimes):
                browser.find_by_xpath(
                    f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[2]/a'
                ).click()
                # timeSleepRandomly()
                # timeSleepOne()
                browserWaitTime(browser)
        elif currentPageNum != totalPageNum and currentPageNum // 10 < totalPageNum // 10:
            # Target sits in an earlier ten-block: count the '<<' clicks,
            # adjusting for whether either number ends in 0.
            if currentPageNum % 10 != 0 and totalPageNum % 10 == 0:
                # e.g. 281、290 / 271、290 / 11、30
                clickBeforeTimes = (totalPageNum // 10) - (currentPageNum // 10) - 1
            elif currentPageNum % 10 != 0 and totalPageNum % 10 != 0:
                # e.g. 271、291 / 18、23
                clickBeforeTimes = (totalPageNum // 10) - (currentPageNum // 10)
            elif currentPageNum % 10 == 0 and totalPageNum % 10 != 0:
                # e.g. 270、291
                clickBeforeTimes = (totalPageNum // 10) - (currentPageNum // 10) + 1
            elif currentPageNum % 10 == 0 and totalPageNum % 10 == 0:
                # e.g. 270、290
                clickBeforeTimes = (totalPageNum // 10) - (currentPageNum // 10)
            # Page backwards ('<<') to the view containing the target.
            for i in range(clickBeforeTimes):
                browser.find_by_xpath(
                    f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[2]/a'
                ).click()
                # timeSleepRandomly()
                # timeSleepOne()
                browserWaitTime(browser)
        # Click the exact page link inside the current view.
        judgeNum = currentPageNum % 10
        if judgeNum:
            clickNum = judgeNum + 2  # +2 offsets the '|<' '<<' arrows
        elif judgeNum == 0:
            clickNum = judgeNum + 12  # page ends in 0 -> last link in the view
        print(
            f"反方向__{searchword}__目標頁碼:{currentPage}, 點擊項次:{clickNum}, 總頁數:{totalPage}"
        )
        browser.find_by_xpath(
            f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{clickNum}]/a'
        ).click()
        accuratePage = browser.find_by_xpath(
            f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{clickNum}]/a'
        ).text
        print(
            f"反方向__{searchword}__目標頁碼:{currentPage}, 點擊頁碼:{accuratePage}, 總頁數:{totalPage}"
        )
    else:
        if currentPageNum <= 10:
            # Target already visible in the default view: click it directly.
            browser.find_by_xpath(
                f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{currentPageNum}]/a'
            ).click()
            accuratePage = browser.find_by_xpath(
                f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{currentPageNum}]/a'
            ).text
        elif 11 <= currentPageNum <= 20:
            # Move to the 11~20 view ('>>'), then click the target.
            browser.find_by_xpath(
                f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[11]/a'
            ).click()
            clickNum = currentPageNum - 10 + 2  # +2 offsets the left arrows
            timeSleepOne()
            browser.find_by_xpath(
                f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{clickNum}]/a'
            ).click()
            accuratePage = browser.find_by_xpath(
                f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{clickNum}]/a'
            ).text
        else:
            # Move to the 11~20 view first ('>>').
            browser.find_by_xpath(
                f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[11]/a'
            ).click()
            if currentPageNum % 10 == 0:
                # e.g. 電冰箱__目標頁碼:290, 點擊頁碼:300, 總頁數:921
                clickNextTimes = currentPageNum // 10 - 1
            else:
                # e.g. 冰箱__目標頁碼:292, 點擊頁碼:292, 總頁數:921
                clickNextTimes = currentPageNum // 10
            # Click '>>' until the view containing the target appears.
            for i in range(clickNextTimes - 1):  # -1: already moved to 11~20
                browser.find_by_xpath(
                    f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[13]/a'
                ).click()
                # timeSleepRandomly()
                # timeSleepOne()
                browserWaitTime(browser)
            # Click the exact page link inside the current view.
            judgeNum = currentPageNum - (clickNextTimes * 10)
            if judgeNum:
                clickNum = judgeNum + 2
            elif judgeNum == 0:
                clickNum = judgeNum + 12
            browser.find_by_xpath(
                f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{clickNum}]/a'
            ).click()
            accuratePage = browser.find_by_xpath(
                f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{clickNum}]/a'
            ).text
        print(
            f"{searchword}__目標頁碼:{currentPage}, 點擊頁碼:{accuratePage}, 總頁數:{totalPage}"
        )
def getPageInARow(input, url, firstPage, topTabList, elementUrl,
                  objectiveFolder, objective, *args):
    """Worker: crawl Google news results for each queued search word and
    write one JSON file per keyword.

    Queue items are search words; for each one a fresh headless browser is
    driven from `url` through every news-result page, and the collected
    links are dumped as google_{timestamp}_{count}_{word}.json.

    FIXES vs. the original:
      * `firstPage` was incremented across loop iterations, so the second
        and later keywords started the page counter where the previous
        keyword left off — each keyword now restarts from `firstPage`
        using a local counter.
      * `topTabList.remove(int(keyNews))` mutated the shared list on every
        iteration, corrupting it for subsequent keywords (and raising
        ValueError if the same tab key came up again) — a filtered copy
        is used instead and the caller's list stays intact.
    """
    begin = timeCalculate()
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        searchword = input.get()
        mkdirForRawData(objectiveFolder, objective, "google", keyword=searchword)

        browser = buildSplinterBrowserHeadless("chrome")
        browser.visit(url)
        browserWaitTime(browser)
        searchwordKeyInAndEnter(browser, searchword)
        browser.driver.set_window_size(1024, 768)

        forSureNws = findOutNws(browser, topTabList)
        keyNews = [key for key in forSureNws if forSureNws[key] == '新聞'].pop()
        # Human-simulated mouse_over must skip the news tab; use a copy so
        # the shared topTabList is preserved for later keywords.
        tabListWithoutNews = [tab for tab in topTabList if tab != int(keyNews)]
        print(f"點擊 topTabList {keyNews} 去到 新聞頁")
        # Click the news tab.
        browser.find_by_xpath(
            f'//*[@id="hdtb-msb-vis"]/div[{keyNews}]/a').click()
        timeSleepRandomly()

        newsDict = {}
        newsDictInner = {}
        currentPage = firstPage  # per-keyword page counter (do not touch firstPage)
        while True:
            print(f"進行 {searchword} 第", currentPage, "頁")
            elementUrlExtract(browser, currentPage, tabListWithoutNews,
                              elementUrl, newsDictInner, searchword)
            judgment = judgeNextPage(browser, searchword)
            if judgment:
                print(f"『{searchword}』 仍有下一頁,繼續爬取!")
                currentPage += 1
            else:
                browser.quit()
                break

        timeStamp = timeStampGenerator()
        newsTotalNum = len(newsDictInner)
        newsDict["dateTime"] = timeStamp
        newsDict["keyword"] = searchword
        newsDict["newsTotalNum"] = newsTotalNum
        newsDict["newsUrl"] = newsDictInner
        with open(f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/google/{searchword}/google_{timeStamp}_{newsTotalNum}_{searchword}.json",
                  'w', encoding='utf-8') as f:
            json.dump(newsDict, f, indent=2, ensure_ascii=False)
        print(
            f'{thisPID} 成功寫出 google_{timeStamp}_{newsTotalNum}_{searchword}.json '
        )

        input.task_done()

    # NOTE(review): unreachable — the `while True` above never breaks.
    end = timeCalculate()
    print(f'{thisPID}_getPageInARaw 累計耗時:{end-begin} 秒')
from libs.splinterBrowser import browserWaitTime
from libs.timeWidget import timeCalculate
from libs.timeWidget import timeStampGenerator
from libs.manipulateDir import mkdirForRawData
from libs.manipulateDir import eraseRawData

if __name__ == '__main__':
    # Script entry: fetch the CWB observation-station overview page and
    # reset the rawData/observationStation/overviewData directory.
    objectiveFolder = "rawData"
    objective = "observationStation"
    begin = timeCalculate()

    browser = buildSplinterBrowser("chrome")
    browserWaitTime(browser)
    browser.visit("http://e-service.cwb.gov.tw/HistoryDataQuery/")
    # Wait for the station map's JS to render before taking the HTML
    # (is_element_present_by_xpath blocks up to wait_time seconds).
    browser.is_element_present_by_xpath('//*[@id="con_r"]/div/div[1]',
                                        wait_time=5)
    soup = BeautifulSoup(browser.html, "html.parser")
    browser.quit()
    print("==============================quit==============================")

    # Start from a clean overviewData folder.
    eraseRawData(objectiveFolder, objective, "overviewData")
    mkdirForRawData(objectiveFolder, objective, "overviewData")
def getPageInARow(input, output, folderWorker, momoMallBrowser):
    """Worker: fetch page 1 of momo-mall results for each queued keyword
    and fan out the remaining pages.

    For each keyword from `input`: resets the raw-data folder, loads the
    keyword's URL (up to 4 retries with a fresh browser), clicks the
    "準確度" sort to normalise the pager, reads current/total page from
    the pager text, writes page 1 to disk, then puts
    "searchword+page+totalPage" jobs on `output` for pages 2..totalPage.

    FIXES vs. the original:
      * `currentPage` / `totalPage` could be unbound (NameError) when every
        retry failed with a connection error — now initialised before the
        retry loop (totalPage=1 means no extra pages are dispatched on
        total failure, and the bad URL is still logged).
      * `browser` could be unbound in the except branches / cleanup when
        browser construction itself raised — now pre-set to None and the
        browserQuit calls guarded.
    """
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        searchword = input.get()
        print('getPageInARow is in new process %s, %s ' % (getPageInARow_proc, thisPID))
        folderWorker.eraseRawData(searchword)
        folderWorker.mkdirForRawData(searchword)
        url = momoMallBrowser.keywordResourcePair._momoMallKeywordUrlPair[searchword]

        # Safe defaults for the total-failure path.
        soup = ""
        browser = None
        currentPage = 1
        totalPage = 1

        # Rebuilding the browser inside the loop avoids the site refusing a
        # single long-lived browser that keeps visiting pages.
        for i in range(4):
            try:
                timeSleepOne()
                timeSleepRandomly()
                browser = momoMallBrowser.intersectionForCrawl(folderWorker.objective)
                timeSleepRandomly()
                browser.visit(url)
                browserWaitTime(browser)
                timeSleepTwo()
                # Click "準確度" (accuracy sort); the pager jumps to page 1.
                try:
                    buyingTendency = momoMallBrowser.browserClickSearchType(browser, 1)
                    browserWaitTime(browser)
                    timeSleepTwo()
                except AttributeError as e:
                    print(f"{thisPID}__{getPageInARow_proc} {searchword} 第1頁 點擊準確度有問題。", e)
                    print(f"{thisPID}__{getPageInARow_proc} 重建browser物件,進行再處理 {i} 次!")
                    browserQuit(browser, thisPID, getPageInARow_proc)
                    timeSleepFour()
                    soup = ""
                    continue
                tempHtml = browser.html
                timeSleepRandomly()
                soup = BeautifulSoup(tempHtml, 'lxml')
                print(f"-----------------讀取{searchword}_{buyingTendency}第 1 頁-----------------成功!")
                try:
                    ## current page and total page, e.g. '頁數5/286'
                    pageState = browser.find_by_xpath(
                        '//*[@id="bt_2_layout_Content"]/div[2]/dl/dt/span')
                    totalPage = int(pageState.text.split('/')[1])
                    currentPage = int(
                        numsHandler.searchFloatNums(pageState.text.split('/')[0]))
                    print(f"-----------------讀取{searchword}_{buyingTendency} 總頁數-----------------成功!")
                except AttributeError as e:
                    print(f"getPageInARow __{searchword}__出錯", e, "重抓一次!")
                    # `raise` would only stop this worker process, not the
                    # whole pool, so retry with fallback values instead.
                    currentPage = 1  # fallback
                    totalPage = 3  # fallback
                    continue
                break
            except (ConnectionRefusedError, TimeoutException, WebDriverException) as e:
                print(f"{thisPID}__{getPageInARow_proc} 讀取{searchword}第 1 頁有問題。", e)
                print(f"{thisPID}__{getPageInARow_proc} 重建browser物件,進行再處理 {i} 次!")
                if browser is not None:
                    browserQuit(browser, thisPID, getPageInARow_proc)
                timeSleepFour()
                timeSleepRandomly()
                soup = ""
            except StaleElementReferenceException as e:
                print("----------------StaleElementReferenceException----------------")
                print(f"{thisPID}__{getPageInARow_proc} 讀取{searchword}第 1 頁有問題。", e)
                print(f"{thisPID}__{getPageInARow_proc} 重建browser物件,進行再處理 {i} 次!")
                if browser is not None:
                    browserQuit(browser, thisPID, getPageInARow_proc)
                timeSleepFour()
                timeSleepRandomly()
                soup = ""

        if not soup:
            # All retries failed: record the URL so it can be re-crawled later.
            errorMessage = f"{url}__{currentPage}__" + "\n"
            folderWorker.writeOutFile(
                f"{folderWorker._BASE_PATH}/dataMunging/{folderWorker.objectiveFolder}/{folderWorker.objective}/badRequest",
                f"badRequest_{searchword}.txt",
                errorMessage,
                writeOutType="a")

        folderWorker.writeOutFile(
            f"{folderWorker._BASE_PATH}/dataMunging/{folderWorker.objectiveFolder}/{folderWorker.objective}/{searchword}",
            f"1_{totalPage}_{searchword}.txt",
            soup)
        print(f'成功寫出 {searchword} 第 {currentPage} 頁')
        print('------接下來要處理 ' + searchword + ' 的頁數---------', totalPage, '頁')

        if browser is not None:
            browserQuit(browser, thisPID, getPageInARow_proc)

        # Rest long enough that every searchword's first page gets read.
        timeSleepEight()
        timeSleepEight()
        timeSleepEight()

        for num in range(2, totalPage + 1):
            strNum = str(num)
            consecutiveData = searchword + "+" + strNum + "+" + str(totalPage)
            output.put(consecutiveData)
        input.task_done()  # Tell the main process this queue item is done.

    # NOTE(review): unreachable — the `while True` above never breaks.
    end = timeCalculate()
    print(f'{thisPID}__getPageInARow 累計耗時:{end-begin} 秒')
def getPageInARowAdvanced(input, folderWorker, momoMallBrowser):
    """Worker: click through momo-mall's pager to one requested result page
    and dump its HTML.

    Queue items are '+'-joined strings: "searchword+currentPage+totalPage".
    Each page is retried up to 4 times with a freshly built browser; after
    four failures the URL is appended to a badRequest log and an empty
    file is written instead. Runs forever; task_done() is called once per
    item. (Original docstring: 開始對POST網址進行splinter的點擊.)
    """
    thisPID = os.getpid()
    while True:
        # print(thisPID,"===========================================")
        consecutiveData = input.get()
        searchword, currentPage, totalPage = consecutiveData.split('+')
        url = momoMallBrowser.keywordResourcePair._momoMallKeywordUrlPair[
            searchword]
        # Rebuilding the browser inside the loop avoids the site refusing a
        # single long-lived browser that keeps visiting pages.
        for i in range(4):
            try:
                timeSleepFour()
                browser = momoMallBrowser.intersectionForCrawl(
                    folderWorker.objective)
                timeSleepRandomly()
                browserSetWindowSize(browser, horizon=1920, vertical=1080)
                timeSleepOne()
                browser.visit(url)
                browserWaitTime(browser)
                timeSleepTwo()
                # Click "準確度" (accuracy sort); the pager jumps back to page 1.
                try:
                    buyingTendency = momoMallBrowser.browserClickSearchType(
                        browser, 1)
                    browserWaitTime(browser)
                    timeSleepTwo()
                except AttributeError as e:
                    print(
                        f"{thisPID}__{getPageInARowAdvanced_proc} {searchword} 在第{currentPage}頁點擊準確度有問題。",
                        e)
                    print(
                        f"{thisPID}__{getPageInARowAdvanced_proc} 重建browser物件,進行再處理 {i} 次!"
                    )
                    browserQuit(browser, thisPID, getPageInARowAdvanced_proc)
                    timeSleepFour()
                    soup = ""
                    continue
                # Walk the pager to the requested page.
                momoMallBrowser.browserClickPageNumber(browser, currentPage,
                                                       totalPage, searchword)
                tempHtml = browser.html
                timeSleepRandomly()
                # Human-like hovering before leaving the page.
                momoMallBrowser.humanSimulate(browser)
                soup = BeautifulSoup(tempHtml, 'lxml')
                # print(f"讀取{searchword}第 {currentPage} 頁,成功!")
                break
            except (ConnectionRefusedError, TimeoutException,
                    WebDriverException) as e:
                print(
                    f"{thisPID}__{getPageInARowAdvanced_proc} 讀取 {searchword} 第 {currentPage} 頁有問題。",
                    e)
                print(
                    f"{thisPID}__{getPageInARowAdvanced_proc} 重建browser物件,進行再處理 {i} 次!"
                )
                # NOTE(review): if browser construction itself raised,
                # `browser` is unbound here and this line raises NameError —
                # confirm whether intersectionForCrawl can throw these.
                browserQuit(browser, thisPID, getPageInARowAdvanced_proc)
                timeSleepFour()
                timeSleepRandomly()
                soup = ""
        # else:
        #     print(f"讀取{searchword}第 {page} 頁,成功!")
        if not soup:
            # All retries failed: record the URL so it can be re-crawled later.
            errorMessage = f"{url}__{currentPage}__" + "\n"
            folderWorker.writeOutFile(
                f"{folderWorker._BASE_PATH}/dataMunging/{folderWorker.objectiveFolder}/{folderWorker.objective}/badRequest",
                f"badRequest_{searchword}.txt",
                errorMessage,
                writeOutType="a")
        folderWorker.writeOutFile(
            f"{folderWorker._BASE_PATH}/dataMunging/{folderWorker.objectiveFolder}/{folderWorker.objective}/{searchword}",
            f"{currentPage}_{totalPage}_{searchword}.txt", soup)
        # print(f'{thisPID} 成功寫出 {searchword} 第{currentPage}頁,總共{totalPage} 頁。')
        browserQuit(browser, thisPID, getPageInARowAdvanced_proc)
        input.task_done()  # Tell the main process this queue item is done.
    # NOTE(review): unreachable — the `while True` above never breaks.
    end = timeCalculate()