Example #1
def overviewUriDistributor(input, output, keywordUrlPair, headers, dirRoute,
                           objectiveFolder, objective, *args):
    begin = timeCalculate()  # needed for the cumulative elapsed-time print at the end of each loop
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        searchword = input.get()
        url = keywordUrlPair[searchword]
        totalPage = getPageFirst(url + "1", headers)

        print('overviewUriDistributor is in new process %s, %s ' %
              (overviewUriDistributor_proc, os.getpid()))
        print('------接下來要發送 ' + searchword + ' 的overviewUri---------', '共',
              totalPage, '頁')

        # Don't defer the folder checks until crawl time; doing so would repeatedly create and delete the folders.
        eraseRawData(objectiveFolder,
                     objective,
                     searchword,
                     keyword="overview")
        mkdirForRawData(objectiveFolder,
                        objective,
                        searchword,
                        keyword="overview")

        for page in range(1, int(totalPage) + 1):
            correctUrl = url + str(page)

            readyTxtFileRoute = dirRoute + f"{searchword}/overview/{page}_{totalPage}_{searchword}.txt"
            # Pack everything into one '+'-delimited string (queueing a tuple previously raised: TypeError: must be str, not tuple).
            consecutiveData = searchword + "+" + correctUrl + "+" + readyTxtFileRoute

            output.put(consecutiveData)
        print(
            f'這裡是 overviewUriDistributor_{thisPID},準備送給  getPageInARow  處理 {totalPage} 頁的 overviewUri'
        )
        print()

        end = timeCalculate()
        print('overviewUriDistributor 累計耗時:{0} 秒'.format(end - begin))
        input.task_done()  # notify the main process that this input item is done
        timeSleepOne()  # pause a few seconds to mimic real-world pacing
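
A minimal sketch of how a distributor worker like this is typically wired up in the main process, following the mp.JoinableQueue / mp.Process pattern visible in Example #4 below. The placeholder inputs (keywordUrlPair, headers, dirRoute, worker count) are assumptions for illustration, not values from the original project:

import multiprocessing as mp

if __name__ == '__main__':
    # Placeholder inputs -- in the original project these come from its config/libs modules.
    keywordUrlPair = {"searchwordA": "https://example.com/list?curPage="}
    headers = {"User-Agent": "Mozilla/5.0"}
    dirRoute = "/tmp/rawData/"
    objectiveFolder, objective = "rawData", "demo"

    searchword_queue = mp.JoinableQueue()   # consumed by the worker as `input`
    url_queue = mp.JoinableQueue()          # the worker puts consecutiveData strings here

    for _ in range(4):                      # worker count is arbitrary here
        proc = mp.Process(target=overviewUriDistributor,
                          args=(searchword_queue, url_queue, keywordUrlPair,
                                headers, dirRoute, objectiveFolder, objective))
        proc.daemon = True                  # daemon workers exit when the main process does
        proc.start()

    for searchword in keywordUrlPair:
        searchword_queue.put(searchword)

    searchword_queue.join()                 # returns once every task_done() has been called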
Example #2
def distributeMonthAvailable(input, output, _weatherRecordAvailable,
                             objectiveFolder, objective, *args):
    begin = timeCalculate()
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        year = input.get()
        monthsAvailable = _weatherRecordAvailable[year]

        eraseRawData(objectiveFolder, objective, year)
        mkdirForRawData(objectiveFolder, objective, year)

        for month in monthsAvailable:
            consecutiveData = year + "+" + month
            output.put(consecutiveData)
            print(
                f'這裡是distributeMonthAvailable,準備送給  getPageInARow  處理: {year}年_{month}月 '
            )
        input.task_done()
        end = timeCalculate()
        print(f'{thisPID}_distributeMonthAvailable 累計耗時:{end-begin} 秒')
        timeSleepOne()
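
For reference, this worker assumes `_weatherRecordAvailable` maps a year string to an iterable of available months. A minimal illustration of that shape (the concrete values are placeholders, not data from the original project):

# Hypothetical shape of _weatherRecordAvailable; the real mapping comes from the project's own data.
_weatherRecordAvailable = {
    "2019": ["01", "02", "03"],
    "2020": ["01"],
}
# Each queue item is a plain year string, and the worker emits "year+month" strings
# (e.g. "2019+01") for the downstream getPageInARow consumer.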
Example #3
def getPageInARow(input, output, keywordUrlPair, objectiveFolder, objective):
    begin = timeCalculate()  # needed for the cumulative elapsed-time print at the end of each loop
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        searchword = input.get()
        print('getPageInARow is in new process %s, %s ' %
              (getPageInARow_proc, thisPID))
        print()
        eraseRawData(objectiveFolder, objective, searchword)
        mkdirForRawData(objectiveFolder, objective, searchword)

        url = keywordUrlPair[searchword]

        # Build the browser inside the while True loop so the "same browser" does not keep visiting the site and get refused.
        for i in range(3):
            try:
                timeSleepOne()
                timeSleepRandomly()

                browser = buildSplinterBrowserHeadless('chrome')

                timeSleepRandomly()

                browser.visit(url)

                browserWaitTime(browser)
                timeSleepTwo()

                tempHtml = browser.html

                timeSleepRandomly()
                soup = BeautifulSoup(tempHtml, 'html.parser')
                print(
                    f"讀取{searchword}第 1 頁>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>成功!"
                )
                break
            except (ConnectionRefusedError, TimeoutException,
                    WebDriverException) as e:
                print(
                    f"{thisPID}__{getPageInARow_proc}  讀取{searchword}第 1 頁有問題。",
                    e)
                print(
                    f"{thisPID}__{getPageInARow_proc}  重建browser物件,進行再處理 {i} 次!"
                )
                timeSleepFour()
                timeSleepRandomly()
                soup = ""
            # else:
            #     print(f"讀取{searchword}第 1 頁>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>成功!")

        try:
            totalPage = interDiv(searchNums(soup.select_one('.totalTxt').text),
                                 30)
        except AttributeError as e:
            print("getPageInARow 出錯", e)
            # force the program to stop here
            raise

        print('------接下來要處理 ' + searchword + ' 的頁數---------', totalPage, '頁')
        print()

        with open(
                f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/{searchword}/1_{totalPage}_{searchword}.txt",
                'w',
                encoding='utf-8') as f:
            f.write(str(soup))
        print()
        print(f'成功寫出  {searchword}  第 1 頁')

        i_browser = 1
        try:
            browser.quit()
            print(
                f"成功關閉 browser{getPageInARow_proc}++++++++++++++++++++++++++++++"
            )
        except:
            print(
                f"放棄 {thisPID}__{getPageInARow_proc} 的 第{i_browser}個browser。")
            i_browser += 1
            print(
                f"kill {thisPID}__{getPageInARow_proc} >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"
            )
            os.kill(thisPID, signal.SIGKILL)

        # Rest longer so that the first page of every searchword gets fetched.
        timeSleepEight()
        timeSleepEight()

        for num in range(2, totalPage + 1):
            strNum = str(num)
            consecutiveData = searchword + "+" + strNum + "+" + str(
                totalPage) + "+" + re.sub(r"curPage=1", f"curPage={strNum}",
                                          url)
            output.put(consecutiveData)
            # print(f'這裡是getPageInARow,準備送給  getPageInARowAdvanced  處理:  {searchword} 的 第 {strNum} 頁,總共{totalPage}')
            print()
        input.task_done()  # notify the main process that this input item is done
        end = timeCalculate()
        print(f'{thisPID}__getPageInARow 累計耗時:{end-begin} 秒')
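
The worker above packs several fields into one '+'-delimited string before putting it on the output queue; the downstream consumer (getPageInARowAdvanced in the log messages) has to split it back apart. A minimal sketch of that unpacking, assuming the searchword itself never contains a '+' (the helper name is hypothetical):

def parseConsecutiveData(consecutiveData):
    # Reverse of the packing above: "searchword+page+totalPage+url".
    # maxsplit=3 keeps any '+' that happens to appear inside the URL.
    searchword, strNum, totalPage, url = consecutiveData.split("+", 3)
    return searchword, int(strNum), int(totalPage), url

# Example with placeholder values:
sample = "searchwordA" + "+" + "2" + "+" + "5" + "+" + "https://example.com/list?curPage=2"
print(parseConsecutiveData(sample))  # ('searchwordA', 2, 5, 'https://example.com/list?curPage=2')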
Example #4
        end = timeCalculate()
        print(f'{thisPID}__getPageInARowAdvanced 累計耗時:{end-begin} 秒')


if __name__ == '__main__':

    objectiveFolder = "rawData"

    objective = "momo"

    begin = timeCalculate()

    print('start in main process %s' % os.getpid())

    eraseRawData(objectiveFolder, objective, "badRequest")
    mkdirForRawData(objectiveFolder, objective, "badRequest")
    print(
        '-------------------------------------------------------------------------'
    )

    # shared queue declarations
    searchword_queue = mp.JoinableQueue()
    url_queue = mp.JoinableQueue()

    # start the worker processes
    Process_1 = []
    for p in range(4):
        getPageInARow_proc = mp.Process(target=getPageInARow,
                                        args=(
                                            searchword_queue,
                                            url_queue,
Example #5
def getPageInARow(input, url, firstPage, topTabList, elementUrl,
                  objectiveFolder, objective, *args):
    begin = timeCalculate()
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        searchword = input.get()

        mkdirForRawData(objectiveFolder,
                        objective,
                        "google",
                        keyword=searchword)
        browser = buildSplinterBrowserHeadless("chrome")

        browser.visit(url)
        browserWaitTime(browser)

        searchwordKeyInAndEnter(browser, searchword)
        browser.driver.set_window_size(1024, 768)

        forSureNws = findOutNws(browser, topTabList)
        keyNews = [key for key in forSureNws if forSureNws[key] == '新聞'].pop()
        # exclude the News tab from the human-like mouse_over
        topTabList.remove(int(keyNews))

        print(f"點擊 topTabList {keyNews} 去到 新聞頁")
        # click the News tab
        browser.find_by_xpath(
            f'//*[@id="hdtb-msb-vis"]/div[{keyNews}]/a').click()
        timeSleepRandomly()

        newsDict = {}
        newsDictInner = {}
        while True:
            print(f"進行 {searchword} 第", firstPage, "頁")
            elementUrlExtract(browser, firstPage, topTabList, elementUrl,
                              newsDictInner, searchword)
            judgment = judgeNextPage(browser, searchword)
            if judgment:
                print(f"『{searchword}』 仍有下一頁,繼續爬取!")
                firstPage += 1
            else:
                browser.quit()
                break

        timeStamp = timeStampGenerator()
        newsTotalNum = len(newsDictInner)
        newsDict["dateTime"] = timeStamp
        newsDict["keyword"] = searchword
        newsDict["newsTotalNum"] = newsTotalNum
        newsDict["newsUrl"] = newsDictInner

        with open(
                f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/google/{searchword}/google_{timeStamp}_{newsTotalNum}_{searchword}.json",
                'w',
                encoding='utf-8') as f:
            json.dump(newsDict, f, indent=2, ensure_ascii=False)
        print(
            f'{thisPID}  成功寫出  google_{timeStamp}_{newsTotalNum}_{searchword}.json '
        )

        input.task_done()
        end = timeCalculate()
        print(f'{thisPID}_getPageInARow 累計耗時:{end-begin} 秒')
Example #6
def dataMunging(input, output, dirRoute, objectiveFolder, objective, domainUrl, *args):
    begin = timeCalculate()  # needed for the cumulative elapsed-time print at the end of each loop
    thisPID = os.getpid()
    energyLabelUrl = "https://ranking.energylabel.org.tw/_Upload/applyMain/applyp/"
    bureauReplace = bureauEnergyReplace()
    while True:
        print(thisPID,"===========================================")
        searchword = input.get() 
        dirNameAccepted = dirRoute + f"{searchword}/overview/"
        dirNameWriteOut = dirRoute + f"{searchword}/"

        # Don't defer the folder checks until crawl time; doing so would repeatedly create and delete the folders.
        eraseRawData(objectiveFolder, objective, searchword, keyword="jsonIntegration")
        mkdirForRawData(objectiveFolder, objective, searchword, keyword="jsonIntegration")

        print('dataMunging is in new process %s, %s ' % (dataMunging_proc, thisPID))
        print()
        print('------接下來要處理資料夾路徑「 ' + dirNameAccepted + '」---------')
        print()
        
        if not os.listdir(dirNameAccepted):
            print(f"============={objective} {searchword} 資料夾沒有東西,此進程準備結束。=============")
            input.task_done()
            timeSleepOne()
            print("========================break========================")
            break

        bureauEnergyDict = {}
        productArray= [] 
        
        for file in initialFileZeroUnderscoreInt(dirNameAccepted):
            # print(" start " + file + " ! ")
                
            with open(dirNameAccepted + file)as f:
                inn = f.read()
            
            # handle the case where soup was written out as an empty string
            if not inn:
                continue
            
            textSoup = BeautifulSoup(inn,'html.parser')

            a = 0
            b = 7

            for i in range(10):  # ten items per page, each spanning a group of 7 elements
                oneset = textSoup.find_all('div',{'class':'col-md-12 column'})[-1].find_all('td',{'align':'left'})[a:b]
                if oneset != []:
                    
                    detailUrl =  domainUrl + oneset[2].a.attrs.get('href')
                    
                    parseUrl = urlparse(detailUrl)
                    qsDict = parse_qs(parseUrl.query)
                    p1 = qsDict['id'].pop()  # the 'id' query parameter is p1
                    p0 = qsDict['p0'].pop()
                    
                    productDict = {}
                    productDict['Id'] = p1 #oneset[2].a.attrs.get('href').split('id=')[1]
                    # The source files contain dirty values, e.g. fridge "product_model": "B23KV-81RE\n", "IB 7030 F TW"; air conditioner "product_model": "PAX-K500CLD ",
                    productDict['product_model'] = bureauReplace.productModel(oneset[0].text)
                    productDict['brand_name'] = oneset[1].text
                    productDict['login_number'] = oneset[2].text
                    productDict['detailUri'] = detailUrl
                    productDict['labeling_company'] = oneset[3].text
                    productDict['efficiency_rating'] = oneset[4].text
                    productDict['from_date_of_expiration'] = bureauReplace.date(oneset[5].text)
                    
                    # We can assemble the outerUri ourselves, e.g.
                    # https://ranking.energylabel.org.tw/product/Approval/file_list.aspx?p1=20901&p0=82409
                    productDict['energy_efficiency_label_outerUri'] = f"{domainUrl}file_list.aspx?p1={p1}&p0={p0}"
                    
                    # The innerUri we actually want, e.g.
                    # https://ranking.energylabel.org.tw/_Upload/applyMain/applyp/20901/SB_photo1/EF2R-13DEX1.jpg
                    # productDict['energy_efficiency_label_innerUri'] = ... requires extra judgment, so it is handled later in "bureauEnergyMunging.py" to avoid slowing down the crawl.


                    productArray.append(productDict)

                    a += 7
                    b += 7
                    # print('done ' + file + ' 的第' + str(i+1) + '份。')
                else:
                    print('在 ' + file + ' 的第' + str(i+1) + '處,發現空值!')
                    break
            
        bureauEnergyDict['product'] = productArray
        bureauEnergyDict['keyword'] = searchword
        timeStamp = timeStampGenerator()
        bureauEnergyDict["dateTime"] = timeStamp

        totalNums = len(bureauEnergyDict['product'])
        
        with open(dirNameWriteOut + f"jsonIntegration/{objective}_overview_{timeStamp}_{totalNums}_{searchword}.json","w",encoding="utf-8")as f:
            json.dump(bureauEnergyDict, f, indent=2, ensure_ascii=False)
        
        print(f'這裡是 dataMunging ,處理{searchword}完成: ' + dirNameWriteOut + "jsonIntegration/")


        # ========= If you only want to clean the overview html, this block can be commented out. ==========
        # Don't defer the folder checks until crawl time; doing so would repeatedly create and delete the folders.
        eraseRawData(objectiveFolder, objective, searchword, keyword="detail")
        mkdirForRawData(objectiveFolder, objective, searchword, keyword="detail")
        
        productIndex = 1
        for file in bureauEnergyDict['product']:
            detailUri = file['detailUri']
            readyTxtFileRoute = dirNameWriteOut + f"detail/{productIndex}_{totalNums}_{searchword}.txt"
            
            # Pack everything into one '+'-delimited string (queueing a tuple previously raised: TypeError: must be str, not tuple).
            consecutiveData = searchword + "+" + detailUri + "+" + readyTxtFileRoute

            output.put(consecutiveData)
            # print('這裡是 dataMunging,準備送給  detailPageInARow  處理: ' + consecutiveData)
            # print()            
            productIndex += 1
        # ========= ================================



        end = timeCalculate()
        print('dataMunging 累計耗時:{0} 秒'.format(end-begin))
        input.task_done()
        timeSleepOne()  # pause a few seconds to mimic real-world pacing
Example #7
    browser = buildSplinterBrowser("chrome")
    browserWaitTime(browser)

    browser.visit("http://e-service.cwb.gov.tw/HistoryDataQuery/")

    # wait for the map JavaScript to render
    browser.is_element_present_by_xpath('//*[@id="con_r"]/div/div[1]',
                                        wait_time=5)

    soup = BeautifulSoup(browser.html, "html.parser")

    browser.quit()
    print("==============================quit==============================")

    eraseRawData(objectiveFolder, objective, "overviewData")
    mkdirForRawData(objectiveFolder, objective, "overviewData")

    timeStamp = timeStampGenerator()

    with open(
            f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/overviewData/observation_{timeStamp}.txt",
            'w',
            encoding='utf-8') as f:
        f.write(str(soup))

    print(f"成功寫出  observation_{timeStamp}.txt")

    end = timeCalculate()

    print('完成!一共耗時:{0} 秒'.format(end - begin))
Example #8
   


if __name__ == '__main__':

    objectiveFolder = "rawData"
    objective = "news"

    searchword = "家電促銷"
    firstPage = 1

    topTabList = [row for row in range(2,6)]  # News, Images, Maps, Videos; [More] has a different xpath, so mouse_over() cannot target it reliably
    elementUrl = [row for row in range(1,11)]  # 10 result targets per page

    url = "https://www.google.com/"
    
    begin = timeCalculate()

    mkdirForRawData(objectiveFolder, objective, "google", keyword=searchword)

    newsDicT = getPageInARow(url, searchword, firstPage, topTabList, elementUrl)

    timeStamp = newsDicT["dateTime"]
    newsTotalNum = newsDicT["newsTotalNum"]
    with open(f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/google/{searchword}/google_{timeStamp}_{newsTotalNum}_{searchword}.json", 'w', encoding='utf-8')as f:
        json.dump(newsDicT, f, indent=2, ensure_ascii=False)
    print("寫出!")

    end = timeCalculate()
    
    print('完成!一共耗時:{0} 秒'.format(end-begin))
Example #9
from libs.timeWidget import timeCalculate
from libs.timeWidget import timeStampGenerator

if __name__ == '__main__':

    objectiveFolder = "rawData"

    objective = "news"

    dirRoute = f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/google"
    dirRouteWriteOut = f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/newsIntegration"

    begin = timeCalculate()

    # eraseRawData(objectiveFolder, objective, "newsIntegration")
    mkdirForRawData(objectiveFolder, objective, "newsIntegration")

    dirRouteToFiles = listSecondDirBelowFiles(dirRoute)

    newsDict = {}
    newsDictInner = {}
    for file in dirRouteToFiles:
        with open(file) as f:
            inn = json.load(f)
        newsDictInner.update(inn['newsUrl'])

    timeStamp = timeStampGenerator()
    newsTotalNum = len(newsDictInner)
    allSearchword = "^".join([row for row in _googleSearchWord])
    newsDict["dateTime"] = timeStamp
    newsDict["keyword"] = allSearchword
Example #10
def getPageInARow(input, headers, objectiveFolder, objective, *args):
    begin = time.time()
    thisPID = os.getpid()
    while True:
        print(thisPID,"===========================================")
        searchwordAndKeyword = input.get()
        searchword, keyword = searchwordAndKeyword.split("+")

        print("getPageInARow is in new process %s, %s " % (getPageInARow_proc, thisPID))
        eraseRawData(objectiveFolder, objective, searchword, keyword=keyword)
        mkdirForRawData(objectiveFolder, objective, searchword, keyword=keyword)

        totalPage, totalRows = getPageFirst(searchword, keyword, headers)
        try:
            totalPagePlusOne = totalPage+1
        except TypeError as e:
            print("getPageFirst 出錯", e)
            raise


        print(f"關鍵字 {searchword} 從{keyword}上取得資料,一共有 {totalPage} 頁, {totalRows}筆。")

        for page in range(1, totalPagePlusOne):
            url = 'https://ecshweb.pchome.com.tw/search/v3.3/{0}/results?q={1}&page={2}&sort=sale/dc'.format(keyword, searchword, page)
            
            for i in range(4):
                try:
                    timeSleepRandomly()
                    res = requests.get(url, headers=headers)
                    res.encoding = 'utf-8'
                    timeSleepRandomly()
                    jsonPage = json.loads(res.text)
                    timeSleepEight()
                    timeSleepRandomly()
                    break
                except (JSONDecodeError, ConnectionRefusedError) as e:  # If visits are too frequent, PChome returns malformed JSON and deserialization raises an exception.
                    print(f"getPageInARow這裡發生錯誤  {keyword}_{searchword}_{page} "+str(e)+"正在處理中。")
                    timeSleepEight()
                    timeSleepRandomly()
                    jsonPage = ""
                except requests.exceptions.ConnectionError as e:
                    print(f"getPageInARow這裡發生錯誤  {keyword}_{searchword}_{page} "+str(e)+"正在處理中。")
                    timeSleepEight()
                    timeSleepRandomly()
                    jsonPage = ""

            if not jsonPage:
                badRequestRoute = f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/badRequest"
                with open(f"{badRequestRoute}/badRequest_{searchword}.txt", "a",  newline='', encoding='utf-8')as f: # newline沒作用...
                    errorMessage = url + "\n"
                    f.write(errorMessage)   #writelines作用在errorMessage是list時
            with open(f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/{searchword}/{keyword}/{page}_{totalPage}_{totalRows}_{keyword+searchword}.json", 'w', encoding='utf-8')as f:
                json.dump(jsonPage, f, indent=2, ensure_ascii=False)
            print("成功寫出  {0}  第 {1} 頁,共 {2} 頁".format(keyword+searchword, page, totalPage))

        print(f"這裡是getPageInARow_{thisPID},準備完成{keyword}_{searchword}工作。 ")
        print()
        end = time.time()
        print('getPageInARow 累計耗時:{0} 秒'.format(end-begin))
        input.task_done()  # notify the main process that this input item is done
        timeSleepOne()  # pause a few seconds to mimic real-world pacing
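
This worker expects each queue item to be a single "searchword+keyword" string, which it splits back into the search term and the PChome category keyword. A minimal sketch of the producing side, following the JoinableQueue wiring shown in Example #4; the searchwords, the "prod" category keyword, and the worker count are placeholders, not values from the original:

import multiprocessing as mp

if __name__ == '__main__':
    headers = {"User-Agent": "Mozilla/5.0"}           # placeholder headers
    objectiveFolder, objective = "rawData", "pchome"  # assumed folder names

    searchword_queue = mp.JoinableQueue()

    for _ in range(2):                                # arbitrary worker count
        getPageInARow_proc = mp.Process(target=getPageInARow,
                                        args=(searchword_queue, headers,
                                              objectiveFolder, objective))
        getPageInARow_proc.daemon = True
        getPageInARow_proc.start()

    # Each job is "searchword+keyword", matching the split("+") inside the worker.
    for searchword in ("placeholderWordA", "placeholderWordB"):
        searchword_queue.put(searchword + "+" + "prod")

    searchword_queue.join()                           # blocks until every task_done() fires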