Exemplo n.º 1
0
def getPageInARow(url, searchword, firstPage, topTabList, elementUrl):
    """Collect news links for ``searchword`` across consecutive result pages.

    Opens a Chrome browser, submits the search word, switches to the tab
    labelled '新聞' (News) and calls ``elementUrlExtract`` on every result
    page until ``judgeNextPage`` reports that no further page exists.

    Args:
        url: Address visited first with the browser.
        searchword: Keyword typed into the search box.
        firstPage: Number printed for the first result page; incremented
            locally for each following page.
        topTabList: Tab positions handed to ``findOutNws``. The list is not
            modified; a filtered copy without the News tab is passed on to
            ``elementUrlExtract``.
        elementUrl: Forwarded unchanged to ``elementUrlExtract``.

    Returns:
        dict with the keys ``dateTime``, ``keyword``, ``newsTotalNum`` and
        ``newsUrl`` (the mapping filled in by ``elementUrlExtract``).

    Raises:
        IndexError: If no tab is labelled '新聞'.
    """
    browser = buildSplinterBrowser("chrome")
    newsDictInner = {}
    # try/finally guarantees the browser is closed even when a step in the
    # middle of the crawl raises; previously it was only closed on success.
    try:
        browser.visit(url)
        browserWaitTime(browser)

        searchwordKeyInAndEnter(browser, searchword)
        browser.driver.set_window_size(1024, 768)

        forSureNws = findOutNws(browser, topTabList)

        # Position of the News tab (last match wins, as with list.pop()).
        keyNews = [key for key in forSureNws if forSureNws[key] == '新聞'].pop()
        # The human-like mouse-over must skip the News tab. Build a filtered
        # copy instead of removing from the caller's list, so the same list
        # can be reused for another call.
        hoverTabList = [tab for tab in topTabList if tab != int(keyNews)]

        print(f"點擊 {keyNews} 去到 新聞頁")
        # Click the News tab.
        browser.find_by_xpath(f'//*[@id="hdtb-msb-vis"]/div[{keyNews}]/a').click()
        timeSleepRandomly()

        while True:
            print(f"進行 {searchword} 第", firstPage, "頁")
            elementUrlExtract(browser, hoverTabList, elementUrl, newsDictInner)
            if not judgeNextPage(browser):
                break
            print("仍有下一頁,繼續爬取!")
            firstPage += 1
    finally:
        browser.quit()

    return {
        "dateTime": timeStampGenerator(),
        "keyword": searchword,
        "newsTotalNum": len(newsDictInner),
        "newsUrl": newsDictInner,
    }
Exemplo n.º 2
0
def dataMunging(input, dirRoute, objectiveFolderClean, objective):
    """Worker loop that merges each keyword's detail data into its overview JSON.

    For every keyword taken from ``input`` the function:

    1. builds the detail data via ``bureauEnergyMunging.detailMungingEntry``
       and writes it to ``<keyword>/jsonIntegration/``;
    2. loads the overview JSON found in the same folder (file name contains
       ``bureauEnergy_overview``);
    3. adds four detail-derived fields to every overview product;
    4. writes the enriched overview under ``_BASE_PATH/dataMunging/...``.

    The loop only ends when a keyword's folder is empty.

    Args:
        input: Queue-like object; ``get()`` yields a keyword and
            ``task_done()`` is called once per processed keyword.
        dirRoute: Path prefix that is string-concatenated with
            ``"<keyword>/..."``, so it must end with a path separator.
        objectiveFolderClean: Folder name used for the cleaned output.
        objective: Name used in the output path and file names.

    Raises:
        IndexError: If no overview JSON file exists for a keyword.
    """
    # NOTE(review): `begin`, `statistic`, `dataMunging_proc` and `_BASE_PATH`
    # are not defined in this function; presumably module-level globals
    # shared with the launching process -- confirm.
    thisPID = os.getpid()
    bureauMunging = bureauEnergyMunging()
    while True:
        print(thisPID,"===========================================")
        searchword = input.get()

        dirNameCheck = dirRoute + f"{searchword}/"
        directory = dirRoute + f"{searchword}/detail/"
        dirNameWriteOut = dirRoute + f"{searchword}/jsonIntegration/"

        print('dataMunging is in new process %s, %s ' % (dataMunging_proc, thisPID))
        print()
        print('------接下來要處理資料夾路徑「 ' + dirNameWriteOut  + '」---------')
        print()

        mkdirForCleanData(objectiveFolderClean, objective)

        # An empty keyword folder ends this worker.
        if not os.listdir(dirNameCheck):
            print(f"============={objective} {searchword} 資料夾沒有東西,此進程準備結束。=============")
            input.task_done()
            timeSleepOne()
            print("========================break========================")
            break

        # detailMungingEntry replaced an earlier per-keyword if/elif dispatch
        # (dehumidifier, air conditioner, refrigerator, water heaters, ...).
        bureauEnergyDetail, totalNums = bureauMunging.detailMungingEntry(searchword, directory)

        detailJsonPath = dirNameWriteOut + f"{objective}_detail_{timeStampGenerator()}_{totalNums}_{searchword}.json"
        with open(detailJsonPath, 'w', encoding='utf-8') as f:
            json.dump(bureauEnergyDetail, f, indent=2, ensure_ascii=False)

        # Locate the overview JSON file and merge the detail data into it.
        overviewJsonFile = [
            overviewFile
            for overviewFile in os.listdir(dirNameWriteOut)
            if "bureauEnergy_overview" in overviewFile
        ].pop()
        # Read explicitly as UTF-8: the JSON files here are written as UTF-8
        # with ensure_ascii=False, so the platform default encoding must not
        # be relied upon.
        with open(dirNameWriteOut + overviewJsonFile, encoding='utf-8') as f:
            bureauEnergyOverview = json.load(f)

        # Map product model -> position in the detail list for quick lookup.
        modelPool = [comparedValue['product_model'] for comparedValue in bureauEnergyDetail['productDetail']]
        modelPoolDict = {v: k for k, v in enumerate(modelPool)}

        # Add the detail-derived fields to every product of the overview.
        for jsonObject in bureauEnergyOverview['product']:
            # The first element (an index) is returned but not needed here.
            _, test_report_of_energy_efficiency, benchmark, annual, labelUri = zipJsonObject(
                modelPoolDict, jsonObject['product_model'], bureauEnergyDetail)

            jsonObject['test_report_of_energy_efficiency'] = test_report_of_energy_efficiency
            jsonObject['efficiency_benchmark'] = benchmark
            jsonObject['annual_power_consumption_degrees_dive_year'] = annual
            jsonObject['energy_efficiency_label_innerUri'] = labelUri

        # Refresh the timestamp of the enriched JSON.
        timeStamp = timeStampGenerator()
        bureauEnergyOverview["dateTime"] = timeStamp

        with open(f"{_BASE_PATH}/dataMunging/{objectiveFolderClean}/{objective}/{objective}_{timeStamp}_{totalNums}_{searchword}.json",'w',encoding='utf-8')as f:
            json.dump(bureauEnergyOverview, f, indent=2, ensure_ascii=False)

        statistic.append(totalNums)

        print(f"這裡是dataMunging_{thisPID},準備完成工作。 ")
        print()
        end = timeCalculate()
        print('dataMunging 累計耗時:{0} 秒'.format(end-begin))
        input.task_done()  # Tell the main process this input item is finished.
        timeSleepOne()  # Pause briefly to simulate real-world pacing.
Exemplo n.º 3
0
def getPageInARow(input, url, firstPage, topTabList, elementUrl,
                  objectiveFolder, objective, *args):
    """Worker loop: crawl news links for each queued keyword and dump them as JSON.

    For every keyword taken from ``input`` a headless Chrome browser is
    opened, the keyword is searched, the tab labelled '新聞' (News) is
    clicked and ``elementUrlExtract`` is called on each result page until
    ``judgeNextPage`` reports no further page. The collected links are
    written to
    ``_BASE_PATH/dataMunging/<objectiveFolder>/<objective>/google/<keyword>/``.

    The loop has no exit condition of its own; it keeps waiting on
    ``input.get()``.

    Args:
        input: Queue-like object; ``get()`` yields a keyword and
            ``task_done()`` is called after its JSON file is written.
        url: Address visited first with the browser.
        firstPage: Page number every keyword starts from.
        topTabList: Tab positions handed to ``findOutNws``. The list is not
            modified; a filtered copy without the News tab is passed on.
        elementUrl: Forwarded unchanged to ``elementUrlExtract``.
        objectiveFolder: Folder name used in the output path.
        objective: Sub-folder name used in the output path.
        *args: Accepted but unused.

    Raises:
        IndexError: If no tab is labelled '新聞' for a keyword.
    """
    begin = timeCalculate()
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        searchword = input.get()

        mkdirForRawData(objectiveFolder,
                        objective,
                        "google",
                        keyword=searchword)
        browser = buildSplinterBrowserHeadless("chrome")
        newsDictInner = {}
        # try/finally guarantees the browser is closed even if a crawl step
        # raises; previously it was only closed after the last page.
        try:
            browser.visit(url)
            browserWaitTime(browser)

            searchwordKeyInAndEnter(browser, searchword)
            browser.driver.set_window_size(1024, 768)

            forSureNws = findOutNws(browser, topTabList)
            keyNews = [key for key in forSureNws
                       if forSureNws[key] == '新聞'].pop()
            # The human-like mouse-over must skip the News tab. Use a
            # filtered copy: removing from the shared list would fail (or
            # drop a wrong tab) from the second keyword on.
            hoverTabList = [tab for tab in topTabList if tab != int(keyNews)]

            print(f"點擊 topTabList {keyNews} 去到 新聞頁")
            # Click the News tab.
            browser.find_by_xpath(
                f'//*[@id="hdtb-msb-vis"]/div[{keyNews}]/a').click()
            timeSleepRandomly()

            # Every keyword restarts at firstPage; incrementing the parameter
            # itself would carry the page count over to the next keyword.
            currentPage = firstPage
            while True:
                print(f"進行 {searchword} 第", currentPage, "頁")
                elementUrlExtract(browser, currentPage, hoverTabList,
                                  elementUrl, newsDictInner, searchword)
                if not judgeNextPage(browser, searchword):
                    break
                print(f"『{searchword}』 仍有下一頁,繼續爬取!")
                currentPage += 1
        finally:
            browser.quit()

        timeStamp = timeStampGenerator()
        newsTotalNum = len(newsDictInner)
        newsDict = {
            "dateTime": timeStamp,
            "keyword": searchword,
            "newsTotalNum": newsTotalNum,
            "newsUrl": newsDictInner,
        }

        with open(
                f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/google/{searchword}/google_{timeStamp}_{newsTotalNum}_{searchword}.json",
                'w',
                encoding='utf-8') as f:
            json.dump(newsDict, f, indent=2, ensure_ascii=False)
        print(
            f'{thisPID}  成功寫出  google_{timeStamp}_{newsTotalNum}_{searchword}.json '
        )

        input.task_done()
        end = timeCalculate()
        print(f'{thisPID}_getPageInARaw 累計耗時:{end-begin} 秒')
Exemplo n.º 4
0
                            readyLink[key] = newsTitle
                            targetNum = 0
                            break
        else:
            pass

    newsObjectWhole = {}
    newsObjectReadyForCrawling = {}
    for key in readyLink:
        try:
            newsObjectReadyForCrawling[key] = newsObject[key]
        except KeyError as e:
            newsObjectReadyForCrawling[key] = None
            print("擷取待爬取url出錯!", e)

    timeStamp = timeStampGenerator()
    totalNum = len(newsObjectReadyForCrawling)
    newsObjectWhole["dateTime"] = timeStamp
    newsObjectWhole["keyword"] = keyword
    newsObjectWhole["newsTotalNum"] = totalNum
    newsObjectWhole["newsUrl"] = newsObjectReadyForCrawling

    #-----------------------------------檢測判斷如何---------------------
    print("篩選出", len(readyLink), "則新聞。")
    # print(len(readyLinkComparison))
    # for row in readyLinkComparison:
    #     print(readyLinkComparison[row])

    newsTitleList = [
        newsObjectReadyForCrawling[key][0] + "\n"
        for key in newsObjectReadyForCrawling
Exemplo n.º 5
0
def dataMunging(input, output, dirRoute,objectiveFolder, objective, domainUrl, *args):
    """Worker loop: turn saved overview HTML pages into JSON and queue detail jobs.

    For every keyword taken from ``input`` the function parses each file in
    ``<dirRoute><keyword>/overview/``, collects one dict per product row,
    writes the result to ``<dirRoute><keyword>/jsonIntegration/`` and then
    puts one ``"keyword+detailUri+targetTxtPath"`` string per product on
    ``output``.

    The loop only ends when a keyword's overview folder is empty.

    Args:
        input: Queue-like object; ``get()`` yields a keyword and
            ``task_done()`` is called once per processed keyword.
        output: Queue-like object receiving the detail-crawl jobs via
            ``put()``.
        dirRoute: Path prefix that is string-concatenated with
            ``"<keyword>/..."``, so it must end with a path separator.
        objectiveFolder: Passed to ``eraseRawData`` / ``mkdirForRawData``.
        objective: Passed to the same helpers and used in the file name.
        domainUrl: Prefix joined with each row's relative detail link and
            with ``file_list.aspx?...``.
        *args: Accepted but unused.

    Raises:
        IndexError: If a page has no ``div`` with class ``col-md-12 column``.
    """
    # NOTE(review): `begin` and `dataMunging_proc` are not defined in this
    # function; presumably module-level globals -- confirm.
    thisPID = os.getpid()
    bureauReplace = bureauEnergyReplace()
    rowsPerPage = 10   # each page lists ten products ...
    cellsPerRow = 7    # ... and each product spans seven <td> cells
    while True:
        print(thisPID,"===========================================")
        searchword = input.get()
        dirNameAccepted = dirRoute + f"{searchword}/overview/"
        dirNameWriteOut = dirRoute + f"{searchword}/"

        # Prepare the folders here rather than during crawling, which would
        # repeatedly create and delete them.
        eraseRawData(objectiveFolder, objective, searchword, keyword="jsonIntegration")
        mkdirForRawData(objectiveFolder, objective, searchword, keyword="jsonIntegration")

        print('dataMunging is in new process %s, %s ' % (dataMunging_proc, thisPID))
        print()
        print('------接下來要處理資料夾路徑「 ' + dirNameAccepted + '」---------')
        print()

        # An empty overview folder ends this worker.
        if not os.listdir(dirNameAccepted):
            print(f"============={objective} {searchword} 資料夾沒有東西,此進程準備結束。=============")
            input.task_done()
            timeSleepOne()
            print("========================break========================")
            break

        productArray = []

        for file in initialFileZeroUnderscoreInt(dirNameAccepted):
            # NOTE(review): read with the platform default encoding; the
            # encoding used when these files were saved is not visible here.
            with open(dirNameAccepted + file)as f:
                inn = f.read()

            # Skip files that were saved empty.
            if not inn:
                continue

            textSoup = BeautifulSoup(inn,'html.parser')

            # The cell list does not change per row, so look it up once per
            # page instead of once per row.
            cells = textSoup.find_all('div',{'class':'col-md-12 column'})[-1].find_all('td',{'align':'left'})

            for i in range(rowsPerPage):
                oneset = cells[i * cellsPerRow:(i + 1) * cellsPerRow]
                if not oneset:
                    print('在 ' + file + ' 的第' + str(i+1) + '處,發現空值!')
                    break

                detailUrl = domainUrl + oneset[2].a.attrs.get('href')

                qsDict = parse_qs(urlparse(detailUrl).query)
                p1 = qsDict['id'].pop()  # the "id" query value is used as p1
                p0 = qsDict['p0'].pop()

                productArray.append({
                    'Id': p1,
                    # Source data contains dirty model names (trailing
                    # newline or space), hence the cleaning helper.
                    'product_model': bureauReplace.productModel(oneset[0].text),
                    'brand_name': oneset[1].text,
                    'login_number': oneset[2].text,
                    'detailUri': detailUrl,
                    'labeling_company': oneset[3].text,
                    'efficiency_rating': oneset[4].text,
                    'from_date_of_expiration': bureauReplace.date(oneset[5].text),
                    # Outer label URI, e.g.
                    # https://ranking.energylabel.org.tw/product/Approval/file_list.aspx?p1=20901&p0=82409
                    'energy_efficiency_label_outerUri': f"{domainUrl}file_list.aspx?p1={p1}&p0={p0}",
                    # The inner label URI (direct image link) needs extra
                    # checks and is added in a later munging step so that it
                    # does not slow the crawler down.
                })

        timeStamp = timeStampGenerator()
        bureauEnergyDict = {
            'product': productArray,
            'keyword': searchword,
            "dateTime": timeStamp,
        }
        totalNums = len(productArray)

        with open(dirNameWriteOut + f"jsonIntegration/{objective}_overview_{timeStamp}_{totalNums}_{searchword}.json","w",encoding="utf-8")as f:
            json.dump(bureauEnergyDict, f, indent=2, ensure_ascii=False)

        print(f'這裡是 dataMunging ,處理{searchword}完成: ' + dirNameWriteOut + "jsonIntegration/")

        # ===== Detail hand-off; comment out to only clean overview HTML =====
        # Prepare the folder here rather than during crawling (see above).
        eraseRawData(objectiveFolder, objective, searchword, keyword="detail")
        mkdirForRawData(objectiveFolder, objective, searchword, keyword="detail")

        for productIndex, product in enumerate(productArray, start=1):
            readyTxtFileRoute = dirNameWriteOut + f"detail/{productIndex}_{totalNums}_{searchword}.txt"
            # The job is sent as one '+'-joined string, not as a tuple.
            output.put("+".join((searchword, product['detailUri'], readyTxtFileRoute)))
        # ====================================================================

        end = timeCalculate()
        print('dataMunging 累計耗時:{0} 秒'.format(end-begin))
        input.task_done()
        timeSleepOne()  # Pause briefly to simulate real-world pacing.
Exemplo n.º 6
0
def dataMunging(fileRouteGenerator):
    """Parse saved monthly weather pages and write them out as one JSON file.

    Every non-empty file yielded by ``fileRouteGenerator`` is parsed with
    BeautifulSoup; each ``<tr>`` becomes one station record. Rows whose
    parsing raises ``AttributeError`` are stored with placeholder values
    instead of being dropped. The combined result is written to
    ``weatherRecord._dirRouteMungingClean + weatherRecord._weather``.

    When no file yields any data (no routes at all, or only empty files)
    nothing is written and the function returns early.

    Args:
        fileRouteGenerator: Iterable of file paths to read.

    Returns:
        None.
    """
    weatherArrayOutter = []
    # Stays None until at least one file has been parsed.
    keyword = None

    for fileRoute in fileRouteGenerator:
        # NOTE(review): read with the platform default encoding; the encoding
        # used when these files were saved is not visible here.
        with open(fileRoute) as f:
            inn = f.read()
        if not inn:
            print("沒有天氣資料可以清洗。。。。。。\n" * 10)
            continue

        textSoup = BeautifulSoup(inn, "html.parser")

        month, year = weatherRecord.generateDateFromFileRoute(fileRoute)
        keyword = f"{year}-{month}"

        weatherRecordArray = []
        for row in textSoup.select("tr"):
            try:
                weatherRecordDict = {
                    "stationName": weatherRecord.selectStationName(row),
                    "temperatureAverage": weatherRecord.selectColumn(row, 0),
                    "temperatureHigh": numsHandler.searchFloatNums(
                        weatherRecord.selectColumn(row, 1)),
                    "temperatureHighDate": weatherRecord.composeDate(
                        year, month, row, 1),
                    "temperatureLow": numsHandler.searchFloatNums(
                        weatherRecord.selectColumn(row, 2)),
                    "temperatureLowDate": weatherRecord.composeDate(
                        year, month, row, 2),
                    "relativeHumidityAverage": weatherRecord.selectColumn(
                        row, 6),
                    "relativeHumidityLow": numsHandler.searchFloatNums(
                        weatherRecord.selectColumn(row, 7)),
                    "relativeHumidityLowDate": weatherRecord.composeDate(
                        year, month, row, 7),
                    "rainful": weatherRecord.selectColumn(row, 3),
                    "rainingDays": weatherRecord.selectColumn(row, 9),
                }
            except AttributeError as e:
                # Keep the station in the output with placeholder values so
                # that every row still produces a record.
                stationName = weatherRecord.selectStationName(row)
                print("error code:", e)
                print(f"{keyword}   {stationName}   沒有資料!")
                print()
                weatherRecordDict = {
                    "stationName": stationName,
                    "temperatureAverage": "0.0",
                    "temperatureHigh": "0.0",
                    "temperatureHighDate": "1970-01-01",
                    "temperatureLow": "0.0",
                    "temperatureLowDate": "1970-01-01",
                    "relativeHumidityAverage": "0",
                    "relativeHumidityLow": "0",
                    "relativeHumidityLowDate": "1970-01-01",
                    "rainful": "0.0",
                    "rainingDays": "0",
                }
            weatherRecordArray.append(weatherRecordDict)

        # One file is done; package its records under its year-month keyword.
        weatherArrayOutter.append({
            "keyword": keyword,
            "records": weatherRecordArray,
        })

        print(f"===========清洗完成  {keyword} =============")

    # Without any parsed file there is no keyword to name the output after;
    # previously this case crashed on the unbound `keyword`.
    if not weatherArrayOutter:
        print("沒有任何天氣資料可供整併,未寫出檔案。")
        return

    dateTime = timeStampGenerator()
    weatherDictOutter = {
        # Keyword of the last file that was processed.
        "latestDate": keyword,
        "dateTime": dateTime,
        "recordsArray": weatherArrayOutter,
    }
    # ensure_ascii=False emits non-ASCII text verbatim, so the file encoding
    # is fixed to UTF-8 instead of depending on the platform default.
    with open(
            weatherRecord._dirRouteMungingClean + weatherRecord._weather +
            f"/weather_{dateTime}_{keyword}.json", 'w',
            encoding='utf-8') as f:
        json.dump(weatherDictOutter, f, indent=2, ensure_ascii=False)