def overviewUriDistributor(input, output, keywordUrlPair, headers, dirRoute,
                           objectiveFolder, objective, *args):
    begin = timeCalculate()
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        searchword = input.get()
        url = keywordUrlPair[searchword]
        totalPage = getPageFirst(url + "1", headers)

        print('overviewUriDistributor is in new process %s, %s ' % (overviewUriDistributor_proc, os.getpid()))
        print('------ about to dispatch the overviewUri for ' + searchword + ' ---------', totalPage, 'pages in total')

        # Check the folders up front instead of during crawling; otherwise they
        # get created and deleted over and over.
        eraseRawData(objectiveFolder, objective, searchword, keyword="overview")
        mkdirForRawData(objectiveFolder, objective, searchword, keyword="overview")

        for page in range(1, int(totalPage) + 1):
            correctUrl = url + str(page)
            # Build the target file path as one string; passing a tuple here
            # raises "TypeError: must be str, not tuple".
            readyTxtFileRoute = dirRoute + f"{searchword}/overview/{page}_{totalPage}_{searchword}.txt"
            consecutiveData = searchword + "+" + correctUrl + "+" + readyTxtFileRoute
            output.put(consecutiveData)

        print(f'This is overviewUriDistributor_{thisPID}; {totalPage} pages of overviewUri are queued for getPageInARow.')
        print()
        end = timeCalculate()
        print('overviewUriDistributor elapsed time: {0} seconds'.format(end - begin))
        input.task_done()  # Tell the main process this input item is done.
        timeSleepOne()  # Pause a few seconds to mimic real-world pacing.
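# The distributor packs three fields into one "+"-delimited string. A minimal
# sketch of how a downstream worker (the real counterpart is getPageInARow /
# detailPageInARow) would unpack that payload; the sample values below are
# illustrative only, not taken from the source.
def parseConsecutiveData(consecutiveData):
    # Order matches the producer: searchword, correctUrl, readyTxtFileRoute.
    # maxsplit=2 keeps any "+" inside the file path intact.
    searchword, correctUrl, readyTxtFileRoute = consecutiveData.split("+", 2)
    return searchword, correctUrl, readyTxtFileRoute

sample = "冰箱+https://example.com/overview/1+rawData/冰箱/overview/1_5_冰箱.txt"
print(parseConsecutiveData(sample))  # ('冰箱', 'https://example.com/overview/1', 'rawData/...')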
def distributeMonthAvailable(input, output, _weatherRecordAvailable,
                             objectiveFolder, objective, *args):
    begin = timeCalculate()
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        year = input.get()
        monthsAvailable = _weatherRecordAvailable[year]

        eraseRawData(objectiveFolder, objective, year)
        mkdirForRawData(objectiveFolder, objective, year)

        for month in monthsAvailable:
            consecutiveData = year + "+" + month
            output.put(consecutiveData)
            print(f'This is distributeMonthAvailable; queueing {year}/{month} for getPageInARow.')

        input.task_done()
        end = timeCalculate()
        print(f'{thisPID}_distributeMonthAvailable elapsed time: {end-begin} seconds')
        timeSleepOne()
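# The function indexes _weatherRecordAvailable by year and iterates the value,
# and year + "+" + month requires both to be strings. A minimal illustration of
# the assumed shape; the years and months here are made up.
_weatherRecordAvailable = {
    "2018": ["1", "2", "3"],
    "2019": ["1", "2"],
}
# distributeMonthAvailable would then emit "2018+1", "2018+2", ... on the queue.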
def getPageInARow(input, output, keywordUrlPair, objectiveFolder, objective):
    begin = timeCalculate()
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        searchword = input.get()
        print('getPageInARow is in new process %s, %s ' % (getPageInARow_proc, thisPID))
        print()
        eraseRawData(objectiveFolder, objective, searchword)
        mkdirForRawData(objectiveFolder, objective, searchword)

        url = keywordUrlPair[searchword]

        # Building the browser inside the while loop avoids the case where one
        # long-lived browser gets rejected after visiting the site repeatedly.
        for i in range(3):
            try:
                timeSleepOne()
                timeSleepRandomly()
                browser = buildSplinterBrowserHeadless('chrome')
                timeSleepRandomly()
                browser.visit(url)
                browserWaitTime(browser)
                timeSleepTwo()
                tempHtml = browser.html
                timeSleepRandomly()
                soup = BeautifulSoup(tempHtml, 'html.parser')
                print(f"Read page 1 of {searchword} >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> success!")
                break
            except (ConnectionRefusedError, TimeoutException, WebDriverException) as e:
                print(f"{thisPID}__{getPageInARow_proc} hit a problem reading page 1 of {searchword}.", e)
                print(f"{thisPID}__{getPageInARow_proc} rebuilding the browser and retrying; attempt {i}!")
                timeSleepFour()
                timeSleepRandomly()
                soup = ""

        try:
            totalPage = interDiv(searchNums(soup.select_one('.totalTxt').text), 30)
        except AttributeError as e:
            print("getPageInARow error", e)
            # Force the program to stop here.
            raise

        print('------ pages to process for ' + searchword + ' ---------', totalPage, 'pages')
        print()

        with open(f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/{searchword}/1_{totalPage}_{searchword}.txt",
                  'w', encoding='utf-8') as f:
            f.write(str(soup))
        print()
        print(f'Wrote out page 1 of {searchword}.')

        i_browser = 1
        try:
            browser.quit()
            print(f"Closed browser {getPageInARow_proc}++++++++++++++++++++++++++++++")
        except Exception:
            print(f"Giving up browser #{i_browser} of {thisPID}__{getPageInARow_proc}.")
            i_browser += 1
            print(f"kill {thisPID}__{getPageInARow_proc} >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
            os.kill(thisPID, signal.SIGKILL)

        # Rest a while longer so page 1 of every searchword gets read.
        timeSleepEight()
        timeSleepEight()

        for num in range(2, totalPage + 1):
            strNum = str(num)
            consecutiveData = searchword + "+" + strNum + "+" + str(totalPage) + "+" + re.sub(r"curPage=1", f"curPage={strNum}", url)
            output.put(consecutiveData)
            # print(f'This is getPageInARow; queueing page {strNum} of {totalPage} for {searchword} to getPageInARowAdvanced.')
        print()

        input.task_done()  # Tell the main process this input item is done.
        end = timeCalculate()
        print(f'{thisPID}__getPageInARow elapsed time: {end-begin} seconds')
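# totalPage is computed by two helpers that are not shown in this file. A
# plausible minimal sketch, assuming searchNums pulls the first integer out of
# the .totalTxt text and interDiv is ceiling division; the names come from the
# source, the bodies are guesses.
import math
import re

def searchNums(text):
    # Extract the first run of digits, e.g. "共 123 筆" -> 123.
    return int(re.search(r"\d+", text).group())

def interDiv(total, perPage):
    # Ceiling division: 123 items at 30 per page -> 5 pages.
    return math.ceil(total / perPage)

assert interDiv(searchNums("共 123 筆"), 30) == 5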
    end = timeCalculate()
    print(f'{thisPID}__getPageInARowAdvanced elapsed time: {end-begin} seconds')


if __name__ == '__main__':
    objectiveFolder = "rawData"
    objective = "momo"

    begin = timeCalculate()
    print('start in main process %s' % os.getpid())

    eraseRawData(objectiveFolder, objective, "badRequest")
    mkdirForRawData(objectiveFolder, objective, "badRequest")
    print('-------------------------------------------------------------------------')

    # Shared queues.
    searchword_queue = mp.JoinableQueue()
    url_queue = mp.JoinableQueue()

    # Start the worker processes.
    Process_1 = []
    for p in range(4):
        getPageInARow_proc = mp.Process(target=getPageInARow,
                                        args=(searchword_queue, url_queue,
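# The main block above is cut off mid-call and the rest of the wiring is not
# shown, but the task_done() calls throughout imply the standard JoinableQueue
# pattern. A self-contained sketch of that pattern; the worker, queue name, and
# sample items here are illustrative, not from the source.
import multiprocessing as mp

def worker(inQueue):
    while True:
        item = inQueue.get()
        print("processing", item)
        inQueue.task_done()  # mirrors the task_done() calls in the workers above

if __name__ == '__main__':
    q = mp.JoinableQueue()
    procs = []
    for _ in range(4):
        p = mp.Process(target=worker, args=(q,))
        p.daemon = True  # daemon workers exit with the main process
        p.start()
        procs.append(p)
    for item in ["冰箱", "冷氣"]:
        q.put(item)
    q.join()  # returns once every put() item has been task_done()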
def getPageInARow(input, url, firstPage, topTabList, elementUrl,
                  objectiveFolder, objective, *args):
    begin = timeCalculate()
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        searchword = input.get()
        mkdirForRawData(objectiveFolder, objective, "google", keyword=searchword)

        browser = buildSplinterBrowserHeadless("chrome")
        browser.visit(url)
        browserWaitTime(browser)

        searchwordKeyInAndEnter(browser, searchword)
        browser.driver.set_window_size(1024, 768)

        forSureNws = findOutNws(browser, topTabList)
        keyNews = [key for key in forSureNws if forSureNws[key] == '新聞'].pop()
        # The humanized mouse_over has to skip the news tab.
        topTabList.remove(int(keyNews))

        print(f"Clicking topTabList {keyNews} to reach the news page.")
        # Click the news tab.
        browser.find_by_xpath(f'//*[@id="hdtb-msb-vis"]/div[{keyNews}]/a').click()
        timeSleepRandomly()

        newsDict = {}
        newsDictInner = {}
        while True:
            print(f"Working on page {firstPage} of {searchword}.")
            elementUrlExtract(browser, firstPage, topTabList, elementUrl, newsDictInner, searchword)
            judgment = judgeNextPage(browser, searchword)
            if judgment:
                print(f"『{searchword}』 has another page; keep crawling!")
                firstPage += 1
            else:
                browser.quit()
                break

        timeStamp = timeStampGenerator()
        newsTotalNum = len(newsDictInner)
        newsDict["dateTime"] = timeStamp
        newsDict["keyword"] = searchword
        newsDict["newsTotalNum"] = newsTotalNum
        newsDict["newsUrl"] = newsDictInner

        with open(f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/google/{searchword}/google_{timeStamp}_{newsTotalNum}_{searchword}.json",
                  'w', encoding='utf-8') as f:
            json.dump(newsDict, f, indent=2, ensure_ascii=False)
        print(f'{thisPID} wrote out google_{timeStamp}_{newsTotalNum}_{searchword}.json')

        input.task_done()
        end = timeCalculate()
        print(f'{thisPID}_getPageInARow elapsed time: {end-begin} seconds')
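# For reference, the JSON written above has the following shape. All values here
# are illustrative; the timestamp format is an assumption, and the keys inside
# "newsUrl" depend on whatever elementUrlExtract accumulates.
exampleNewsDict = {
    "dateTime": "20190314_120001",
    "keyword": "家電促銷",
    "newsTotalNum": 2,
    "newsUrl": {
        "1": "https://example.com/news/a",
        "2": "https://example.com/news/b",
    },
}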
def dataMunging(input, output, dirRoute, objectiveFolder, objective, domainUrl, *args):
    begin = timeCalculate()
    thisPID = os.getpid()
    energyLabelUrl = "https://ranking.energylabel.org.tw/_Upload/applyMain/applyp/"
    bureauReplace = bureauEnergyReplace()
    while True:
        print(thisPID, "===========================================")
        searchword = input.get()
        dirNameAccepted = dirRoute + f"{searchword}/overview/"
        dirNameWriteOut = dirRoute + f"{searchword}/"

        # Check the folders up front instead of during crawling; otherwise they
        # get created and deleted over and over.
        eraseRawData(objectiveFolder, objective, searchword, keyword="jsonIntegration")
        mkdirForRawData(objectiveFolder, objective, searchword, keyword="jsonIntegration")

        print('dataMunging is in new process %s, %s ' % (dataMunging_proc, thisPID))
        print()
        print('------ about to process folder path 「' + dirNameAccepted + '」---------')
        print()

        if not os.listdir(dirNameAccepted):
            print(f"============= The {objective} {searchword} folder is empty; this process is about to end. =============")
            input.task_done()
            timeSleepOne()
            print("========================break========================")
            break

        bureauEnergyDict = {}
        productArray = []

        for file in initialFileZeroUnderscoreInt(dirNameAccepted):
            # print(" start " + file + " ! ")
            with open(dirNameAccepted + file) as f:
                inn = f.read()

            # Handle the soup == "" case.
            if not inn:
                continue

            textSoup = BeautifulSoup(inn, 'html.parser')

            a = 0
            b = 7
            for i in range(10):  # Ten products per page, seven elements per product.
                oneset = textSoup.find_all('div', {'class': 'col-md-12 column'})[-1].find_all('td', {'align': 'left'})[a:b]
                if oneset != []:
                    detailUrl = domainUrl + oneset[2].a.attrs.get('href')
                    parseUrl = urlparse(detailUrl)
                    qsDict = parse_qs(parseUrl.query)
                    p1 = qsDict['id'].pop()  # "id" is p1.
                    p0 = qsDict['p0'].pop()

                    productDict = {}
                    productDict['Id'] = p1  # oneset[2].a.attrs.get('href').split('id=')[1]
                    # The files contain dirty values, e.g. refrigerators
                    # "product_model": "B23KV-81RE\n", "IB 7030 F TW" and air
                    # conditioners "product_model": "PAX-K500CLD ".
                    productDict['product_model'] = bureauReplace.productModel(oneset[0].text)
                    productDict['brand_name'] = oneset[1].text
                    productDict['login_number'] = oneset[2].text
                    productDict['detailUri'] = detailUrl
                    productDict['labeling_company'] = oneset[3].text
                    productDict['efficiency_rating'] = oneset[4].text
                    productDict['from_date_of_expiration'] = bureauReplace.date(oneset[5].text)

                    # We can assemble the outerUri, e.g.
                    # https://ranking.energylabel.org.tw/product/Approval/file_list.aspx?p1=20901&p0=82409
                    productDict['energy_efficiency_label_outerUri'] = f"{domainUrl}file_list.aspx?p1={p1}&p0={p0}"

                    # The innerUri we actually want looks like
                    # https://ranking.energylabel.org.tw/_Upload/applyMain/applyp/20901/SB_photo1/EF2R-13DEX1.jpg
                    # but deriving it needs extra checks, so it is handled in
                    # "bureauEnergyMunging.py" instead, to keep the crawler moving.
                    # productDict['energy_efficiency_label_innerUri'] = ...

                    productArray.append(productDict)

                    a += 7
                    b += 7
                    # print('done ' + file + ' item ' + str(i+1) + '.')
                else:
                    print('Found an empty set at item ' + str(i+1) + ' of ' + file + '!')
                    break

        bureauEnergyDict['product'] = productArray
        bureauEnergyDict['keyword'] = searchword
        timeStamp = timeStampGenerator()
        bureauEnergyDict["dateTime"] = timeStamp
        totalNums = len(bureauEnergyDict['product'])

        with open(dirNameWriteOut + f"jsonIntegration/{objective}_overview_{timeStamp}_{totalNums}_{searchword}.json",
                  "w", encoding="utf-8") as f:
            json.dump(bureauEnergyDict, f, indent=2, ensure_ascii=False)
        print(f'This is dataMunging; finished {searchword}: ' + dirNameWriteOut + "jsonIntegration/")

        # ========= Comment this block out to clean only the overview html. ==========
        # Check the folders up front instead of during crawling; otherwise they
        # get created and deleted over and over.
        eraseRawData(objectiveFolder, objective, searchword, keyword="detail")
        mkdirForRawData(objectiveFolder, objective, searchword, keyword="detail")

        productIndex = 1
        for file in bureauEnergyDict['product']:
            detailUri = file['detailUri']
            # Build the target file path as one string; passing a tuple here
            # raises "TypeError: must be str, not tuple".
            readyTxtFileRoute = dirNameWriteOut + f"detail/{productIndex}_{totalNums}_{searchword}.txt"
            consecutiveData = searchword + "+" + detailUri + "+" + readyTxtFileRoute
            output.put(consecutiveData)
            # print('This is dataMunging; queueing for detailPageInARow: ' + consecutiveData)
            # print()
            productIndex += 1
        # ========= ================================

        end = timeCalculate()
        print('dataMunging elapsed time: {0} seconds'.format(end - begin))
        input.task_done()
        timeSleepOne()  # Pause a few seconds to mimic real-world pacing.
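# bureauEnergyReplace is not shown in this file. Given the dirty values quoted
# in the comments above (trailing newlines and padding in product_model, raw
# date text), a plausible minimal sketch; only the class and method names come
# from the source, the bodies and the date format are guesses.
class bureauEnergyReplace:
    def productModel(self, text):
        # "B23KV-81RE\n" -> "B23KV-81RE", "PAX-K500CLD " -> "PAX-K500CLD"
        return text.strip()

    def date(self, text):
        # Normalize e.g. "2021/03/14 " to "2021-03-14"; separator assumed.
        return text.strip().replace("/", "-")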
browser = buildSplinterBrowser("chrome")
browserWaitTime(browser)
browser.visit("http://e-service.cwb.gov.tw/HistoryDataQuery/")
# Wait for the map's JS to come up.
browser.is_element_present_by_xpath('//*[@id="con_r"]/div/div[1]', wait_time=5)

soup = BeautifulSoup(browser.html, "html.parser")
browser.quit()
print("==============================quit==============================")

eraseRawData(objectiveFolder, objective, "overviewData")
mkdirForRawData(objectiveFolder, objective, "overviewData")

timeStamp = timeStampGenerator()
with open(f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/overviewData/observation_{timeStamp}.txt",
          'w', encoding='utf-8') as f:
    f.write(str(soup))
print(f"Wrote out observation_{timeStamp}.txt")

end = timeCalculate()
print('Done! Total elapsed time: {0} seconds'.format(end - begin))
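# timeStampGenerator is shared by several of these scripts but not shown. A
# minimal sketch, assuming it returns a filename-safe local timestamp; the
# exact format string is a guess.
from datetime import datetime

def timeStampGenerator():
    # e.g. "20190314_120001"; safe to embed in the filenames used above.
    return datetime.now().strftime("%Y%m%d_%H%M%S")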
if __name__ == '__main__':
    objectiveFolder = "rawData"
    objective = "news"
    searchword = "家電促銷"
    firstPage = 1
    # News, images, maps, videos, [more]: their xpaths differ, so mouse_over()
    # cannot target them precisely.
    topTabList = list(range(2, 6))
    # Ten targets per page.
    elementUrl = list(range(1, 11))
    url = "https://www.google.com/"

    begin = timeCalculate()

    mkdirForRawData(objectiveFolder, objective, "google", keyword=searchword)
    newsDicT = getPageInARow(url, searchword, firstPage, topTabList, elementUrl)

    timeStamp = newsDicT["dateTime"]
    newsTotalNum = newsDicT["newsTotalNum"]
    with open(f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/google/{searchword}/google_{timeStamp}_{newsTotalNum}_{searchword}.json",
              'w', encoding='utf-8') as f:
        json.dump(newsDicT, f, indent=2, ensure_ascii=False)
    print("Written out!")

    end = timeCalculate()
    print('Done! Total elapsed time: {0} seconds'.format(end - begin))
from libs.timeWidget import timeCalculate
from libs.timeWidget import timeStampGenerator


if __name__ == '__main__':
    objectiveFolder = "rawData"
    objective = "news"
    dirRoute = f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/google"
    dirRouteWriteOut = f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/newsIntegration"

    begin = timeCalculate()

    # eraseRawData(objectiveFolder, objective, "newsIntegration")
    mkdirForRawData(objectiveFolder, objective, "newsIntegration")

    dirRouteToFiles = listSecondDirBelowFiles(dirRoute)

    newsDict = {}
    newsDictInner = {}
    for file in dirRouteToFiles:
        with open(file) as f:
            inn = json.load(f)
        newsDictInner.update(inn['newsUrl'])

    timeStamp = timeStampGenerator()
    newsTotalNum = len(newsDictInner)
    allSearchword = "^".join(_googleSearchWord)

    newsDict["dateTime"] = timeStamp
    newsDict["keyword"] = allSearchword
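    # The script is truncated at this point. Given the parallel construction in
    # the per-keyword crawler above, the tail presumably fills in the remaining
    # keys and writes the merged dictionary to dirRouteWriteOut. A sketch of
    # that assumed continuation; the output filename is a guess that follows
    # the established naming pattern, and _googleSearchWord is assumed to be
    # the project-wide list of search keywords (e.g. ["家電促銷", ...]).
    newsDict["newsTotalNum"] = newsTotalNum
    newsDict["newsUrl"] = newsDictInner

    with open(f"{dirRouteWriteOut}/google_{timeStamp}_{newsTotalNum}_{allSearchword}.json",
              'w', encoding='utf-8') as f:
        json.dump(newsDict, f, indent=2, ensure_ascii=False)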
def getPageInARow(input, headers, objectiveFolder, objective, *args):
    begin = time.time()
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        searchwordAndKeyword = input.get()
        searchword, keyword = searchwordAndKeyword.split("+")
        print("getPageInARow is in new process %s, %s " % (getPageInARow_proc, thisPID))

        eraseRawData(objectiveFolder, objective, searchword, keyword=keyword)
        mkdirForRawData(objectiveFolder, objective, searchword, keyword=keyword)

        totalPage, totalRows = getPageFirst(searchword, keyword, headers)
        try:
            totalPagePlusOne = totalPage + 1
        except TypeError as e:
            print("getPageFirst error", e)
            raise
        print(f"Keyword {searchword} fetched from {keyword}: {totalPage} pages, {totalRows} rows in total.")

        for page in range(1, totalPagePlusOne):
            url = 'https://ecshweb.pchome.com.tw/search/v3.3/{0}/results?q={1}&page={2}&sort=sale/dc'.format(keyword, searchword, page)
            for i in range(4):
                try:
                    timeSleepRandomly()
                    res = requests.get(url, headers=headers)
                    res.encoding = 'utf-8'
                    timeSleepRandomly()
                    jsonPage = json.loads(res.text)
                    timeSleepEight()
                    timeSleepRandomly()
                    break
                except (JSONDecodeError, ConnectionRefusedError) as e:
                    # When visits are too frequent, pchome returns malformed
                    # JSON, so deserialization raises and we retry.
                    print(f"getPageInARow hit an error at {keyword}_{searchword}_{page}: " + str(e) + "; handling it.")
                    timeSleepEight()
                    timeSleepRandomly()
                    jsonPage = ""
                except requests.exceptions.ConnectionError as e:
                    print(f"getPageInARow hit an error at {keyword}_{searchword}_{page}: " + str(e) + "; handling it.")
                    timeSleepEight()
                    timeSleepRandomly()
                    jsonPage = ""

            if not jsonPage:
                badRequestRoute = f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/badRequest"
                with open(f"{badRequestRoute}/badRequest_{searchword}.txt", "a", newline='', encoding='utf-8') as f:
                    # newline='' has no effect here...
                    errorMessage = url + "\n"
                    f.write(errorMessage)  # writelines would be the call if errorMessage were a list.

            with open(f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/{searchword}/{keyword}/{page}_{totalPage}_{totalRows}_{keyword+searchword}.json",
                      'w', encoding='utf-8') as f:
                json.dump(jsonPage, f, indent=2, ensure_ascii=False)
            print("Wrote out {0} page {1} of {2}.".format(keyword + searchword, page, totalPage))

        print(f"This is getPageInARow_{thisPID}, about to finish the {keyword}_{searchword} job.")
        print()
        end = time.time()
        print('getPageInARow elapsed time: {0} seconds'.format(end - begin))
        input.task_done()  # Tell the main process this input item is done.
        timeSleepOne()  # Pause a few seconds to mimic real-world pacing.
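# getPageFirst is not shown in this file. A plausible minimal sketch, assuming
# the pchome v3.3 endpoint reports its pagination totals in the page-1 JSON;
# the "totalPage"/"totalRows" field names are assumptions inferred from how the
# two return values are used above.
import json
import requests

def getPageFirst(searchword, keyword, headers):
    # Fetch page 1 and read the pagination totals from the response body.
    url = ('https://ecshweb.pchome.com.tw/search/v3.3/{0}/results'
           '?q={1}&page=1&sort=sale/dc').format(keyword, searchword)
    res = requests.get(url, headers=headers)
    res.encoding = 'utf-8'
    jsonPage = json.loads(res.text)
    return jsonPage.get('totalPage'), jsonPage.get('totalRows')

# Returning None for a missing field would explain the TypeError guard around
# totalPage + 1 in the worker above.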