def intersectionForCrawl(self, ecommerceName, ecommerceLink=None):
    if ecommerceName == self._eCommerceMomoMall:
        browser = buildSplinterBrowserHeadless('chrome')
        return browser
    else:
        return None
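
# Hypothetical usage sketch (not part of the original source): ask
# intersectionForCrawl for a browser and fall back when the site is
# unsupported. The crawler instance, site name, and link passed in are assumptions.
def exampleIntersectionForCrawlUsage(crawler, ecommerceName, ecommerceLink):
    browser = crawler.intersectionForCrawl(ecommerceName, ecommerceLink)
    if browser is None:
        print(f"{ecommerceName} is not supported; no browser was built.")
        return
    try:
        browser.visit(ecommerceLink)
    finally:
        browser.quit()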
def requestsHandlingWhenTimeoutOccur(url, browserName):
    # Back off, rebuild the browser, and revisit the url after a timeout.
    timeSleepEight()
    browser = buildSplinterBrowserHeadless(browserName)
    timeSleepRandomly()
    browser.visit(url)
    return browser  # hand the rebuilt browser back to the caller
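
# Hypothetical usage sketch (not part of the original source): retry a visit
# through requestsHandlingWhenTimeoutOccur when the first attempt fails,
# assuming it returns the rebuilt browser as above. fetchHtmlWithRetry is an
# illustrative name, not an existing helper.
def fetchHtmlWithRetry(url, browserName='chrome'):
    browser = buildSplinterBrowserHeadless(browserName)
    try:
        browser.visit(url)
    except (ConnectionRefusedError, TimeoutException, WebDriverException):
        browser = requestsHandlingWhenTimeoutOccur(url, browserName)
    html = browser.html
    browser.quit()
    return html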
def getPageInARowAdvanced(input, objectiveFolder, objective):
    begin = timeCalculate()  # start time for this worker process
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        consecutiveUrl = input.get()
        searchword, page, totalPage, url = consecutiveUrl.split('+')

        print(f"{thisPID}__{getPageInARowAdvanced_proc} starts processing page {page} of {searchword}:")

        # Building the browser inside the while-True loop avoids a single
        # long-lived browser being rejected after visiting too many pages.
        for i in range(3):
            try:
                timeSleepFour()
                browser = buildSplinterBrowserHeadless('chrome')
                timeSleepRandomly()
                browser.visit(url)
                browserWaitTime(browser)
                timeSleepTwo()

                tempHtml = browser.html
                timeSleepRandomly()
                soup = BeautifulSoup(tempHtml, 'html.parser')
                print(f"Read page {page} of {searchword} successfully!")
                break
            except (ConnectionRefusedError, TimeoutException, WebDriverException) as e:
                print(f"{thisPID}__{getPageInARowAdvanced_proc} hit a problem reading page {page} of {searchword}.", e)
                print(f"{thisPID}__{getPageInARowAdvanced_proc} rebuilds the browser and retries; attempt {i}!")
                timeSleepFour()
                timeSleepRandomly()
                soup = ""

        if not soup:
            badRequestRoute = f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/badRequest"
            # The newline argument has no effect here.
            with open(f"{badRequestRoute}/badRequest_{searchword}.txt", "a", newline='', encoding='utf-8') as f:
                errorMessage = url + "\n"
                f.write(errorMessage)  # writelines would apply if errorMessage were a list

        with open(f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/{searchword}/{page}_{totalPage}_{searchword}.txt", 'w', encoding='utf-8') as f:
            f.write(str(soup))
        print()
        print(f'{thisPID} successfully wrote page {page} of {searchword}, out of {totalPage} pages.')

        try:
            browser.quit()
            print(f"Closed browser {thisPID}__{getPageInARowAdvanced_proc}++++++++++++++++++++++++++++++")
        except Exception:
            print(f"Giving up on the browser of {thisPID}__{getPageInARowAdvanced_proc}.")
            print(f"kill {thisPID}__{getPageInARowAdvanced_proc} >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
            os.kill(thisPID, signal.SIGKILL)

        input.task_done()  # tell the main process this input item is done
        end = timeCalculate()
        print(f'{thisPID}__getPageInARowAdvanced cumulative elapsed time: {end-begin} seconds')
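
# Hypothetical sketch (not part of the original source) of the "+"-joined
# message getPageInARowAdvanced pops off its queue: searchword, page number,
# total page count, and the page URL, in that order. enqueuePageJob is an
# illustrative name; note that split('+') in the worker assumes the URL itself
# contains no '+' characters.
def enqueuePageJob(urlQueue, searchword, page, totalPage, url):
    consecutiveUrl = "+".join([searchword, str(page), str(totalPage), url])
    urlQueue.put(consecutiveUrl)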
def getPageInARow(input, output, keywordUrlPair, objectiveFolder, objective):
    begin = timeCalculate()  # start time for this worker process
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        searchword = input.get()
        print('getPageInARow is in new process %s, %s ' % (getPageInARow_proc, thisPID))
        print()
        eraseRawData(objectiveFolder, objective, searchword)
        mkdirForRawData(objectiveFolder, objective, searchword)

        url = keywordUrlPair[searchword]

        # Building the browser inside the while-True loop avoids a single
        # long-lived browser being rejected after visiting too many pages.
        for i in range(3):
            try:
                timeSleepOne()
                timeSleepRandomly()
                browser = buildSplinterBrowserHeadless('chrome')
                timeSleepRandomly()
                browser.visit(url)
                browserWaitTime(browser)
                timeSleepTwo()

                tempHtml = browser.html
                timeSleepRandomly()
                soup = BeautifulSoup(tempHtml, 'html.parser')
                print(f"Read page 1 of {searchword} >>>>>>>>>>>>>>>>>>>> successfully!")
                break
            except (ConnectionRefusedError, TimeoutException, WebDriverException) as e:
                print(f"{thisPID}__{getPageInARow_proc} hit a problem reading page 1 of {searchword}.", e)
                print(f"{thisPID}__{getPageInARow_proc} rebuilds the browser and retries; attempt {i}!")
                timeSleepFour()
                timeSleepRandomly()
                soup = ""

        try:
            totalPage = interDiv(searchNums(soup.select_one('.totalTxt').text), 30)
        except AttributeError as e:
            print("getPageInARow error", e)
            # Force the program to stop here.
            raise

        print('------ pages still to process for ' + searchword + ' ---------', totalPage, 'pages')
        print()

        with open(f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/{searchword}/1_{totalPage}_{searchword}.txt", 'w', encoding='utf-8') as f:
            f.write(str(soup))
        print()
        print(f'Successfully wrote page 1 of {searchword}.')

        i_browser = 1
        try:
            browser.quit()
            print(f"Closed browser {getPageInARow_proc}++++++++++++++++++++++++++++++")
        except Exception:
            print(f"Giving up on browser #{i_browser} of {thisPID}__{getPageInARow_proc}.")
            i_browser += 1
            print(f"kill {thisPID}__{getPageInARow_proc} >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
            os.kill(thisPID, signal.SIGKILL)

        # Rest a while longer so the first page of every searchword gets read.
        timeSleepEight()
        timeSleepEight()

        for num in range(2, totalPage + 1):
            strNum = str(num)
            # Hand pages 2..totalPage to getPageInARowAdvanced via the output queue.
            consecutiveData = searchword + "+" + strNum + "+" + str(totalPage) + "+" + re.sub(r"curPage=1", f"curPage={strNum}", url)
            output.put(consecutiveData)
        print()

        input.task_done()  # tell the main process this input item is done
        end = timeCalculate()
        print(f'{thisPID}__getPageInARow cumulative elapsed time: {end-begin} seconds')
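
# Hypothetical wiring sketch (not part of the original source) for the two
# workers above: a JoinableQueue of keywords feeds getPageInARow, which writes
# page 1 and pushes the remaining page URLs onto a second JoinableQueue
# consumed by getPageInARowAdvanced. Process counts and keywordUrlPair are
# illustrative assumptions.
def exampleLaunchCrawl(keywordUrlPair, objectiveFolder, objective):
    from multiprocessing import JoinableQueue, Process

    keywordQueue = JoinableQueue()
    pageUrlQueue = JoinableQueue()

    # Daemon workers for the first page of each keyword.
    for _ in range(2):
        Process(target=getPageInARow,
                args=(keywordQueue, pageUrlQueue, keywordUrlPair,
                      objectiveFolder, objective),
                daemon=True).start()

    # Daemon workers for the remaining pages.
    for _ in range(4):
        Process(target=getPageInARowAdvanced,
                args=(pageUrlQueue, objectiveFolder, objective),
                daemon=True).start()

    for searchword in keywordUrlPair:
        keywordQueue.put(searchword)

    keywordQueue.join()   # every keyword's first page handled
    pageUrlQueue.join()   # every remaining page handled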
def getPageInARow(input, url, firstPage, topTabList, elementUrl, objectiveFolder, objective, *args):
    begin = timeCalculate()
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        searchword = input.get()

        mkdirForRawData(objectiveFolder, objective, "google", keyword=searchword)

        browser = buildSplinterBrowserHeadless("chrome")
        browser.visit(url)
        browserWaitTime(browser)

        searchwordKeyInAndEnter(browser, searchword)
        browser.driver.set_window_size(1024, 768)

        forSureNws = findOutNws(browser, topTabList)
        # '新聞' is the Chinese label of the News tab on the results page.
        keyNews = [key for key in forSureNws if forSureNws[key] == '新聞'].pop()
        # Exclude the News tab from the human-like mouse_over moves.
        topTabList.remove(int(keyNews))

        print(f"Clicking topTabList {keyNews} to reach the news page")

        # Click the News tab.
        browser.find_by_xpath(f'//*[@id="hdtb-msb-vis"]/div[{keyNews}]/a').click()
        timeSleepRandomly()

        newsDict = {}
        newsDictInner = {}
        while True:
            print(f"Processing page {firstPage} of {searchword}")
            elementUrlExtract(browser, firstPage, topTabList, elementUrl, newsDictInner, searchword)
            judgment = judgeNextPage(browser, searchword)
            if judgment:
                print(f"'{searchword}' still has a next page; keep crawling!")
                firstPage += 1
            else:
                browser.quit()
                break

        timeStamp = timeStampGenerator()
        newsTotalNum = len(newsDictInner)
        newsDict["dateTime"] = timeStamp
        newsDict["keyword"] = searchword
        newsDict["newsTotalNum"] = newsTotalNum
        newsDict["newsUrl"] = newsDictInner

        with open(f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/google/{searchword}/google_{timeStamp}_{newsTotalNum}_{searchword}.json", 'w', encoding='utf-8') as f:
            json.dump(newsDict, f, indent=2, ensure_ascii=False)
        print(f'{thisPID} successfully wrote google_{timeStamp}_{newsTotalNum}_{searchword}.json')

        input.task_done()
        end = timeCalculate()
        print(f'{thisPID}_getPageInARow cumulative elapsed time: {end-begin} seconds')
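
# Hypothetical launch sketch (not part of the original source) for the Google
# News variant above: keywords go through a JoinableQueue and each daemon
# worker runs getPageInARow with the shared search settings. The url,
# topTabList, elementUrl, and keyword list are placeholder assumptions.
def exampleLaunchGoogleNewsCrawl(searchwords, url, topTabList, elementUrl,
                                 objectiveFolder, objective):
    from multiprocessing import JoinableQueue, Process

    keywordQueue = JoinableQueue()

    for _ in range(2):
        Process(target=getPageInARow,
                args=(keywordQueue, url, 1, list(topTabList), elementUrl,
                      objectiveFolder, objective),
                daemon=True).start()

    for searchword in searchwords:
        keywordQueue.put(searchword)

    keywordQueue.join()  # block until every keyword has been crawled and dumped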