Example #1
    def requests(cls, url, headers):
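        """Fetch url with up to three retries on ConnectionError, parse the <article> body text, and return (videoLinkInContent, newsContent)."""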
        for i in range(3):
            try:
                timeSleepOne()
                res = requests.get(url, headers=headers)
                res.encoding = 'utf-8'
                timeSleepRandomly()

                soup = BeautifulSoup(res.text, 'html.parser')
                newsContent = [
                    textMiningRegex.discardSpace(
                        textMiningRegex.replaceEscapeAlphabet(row.text))
                    for row in soup.find("article").stripped_strings
                    if row != "" and not "googletag.cmd.push" in row
                    and not "function" in row
                ]
                videoLinkInContent = None  # the article body itself has no video
                break
            except requests.exceptions.ConnectionError as e:
                print(url, "發生問題。", e)
                print()
                timeSleepRandomly()
                timeSleepTwo()
                newsContent = None
                videoLinkInContent = None

        return videoLinkInContent, newsContent
Example #2
def getPageInARaw(input, _headers, objectiveFolder, objective, *args):
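    """Queue worker: for each "year+month" task, fetch the CWB monthly-statistics page and dump the raw HTML to {month}_{year}.txt."""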
    begin = timeCalculate()
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        consecutiveUrl = input.get()
        year, month = consecutiveUrl.split("+")

        url = f"https://www.cwb.gov.tw/V8/C/C/Statistics/MonthlyData/MOD/{year}_{month}.html"
        res = requests.get(url, headers=_headers)
        timeSleepRandomly()
        res.encoding = "utf-8"

        soup = BeautifulSoup(res.text, 'html.parser')

        with open(
                f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/{year}/{month}_{year}.txt",
                'w',
                encoding='utf-8') as f:
            f.write(str(soup))
        print()
        print(f'{thisPID}  成功寫出  {month}_{year}.txt ')

        input.task_done()
        end = timeCalculate()
        print(f'{thisPID}_getPageInARaw 累計耗時:{end-begin} 秒')
        timeSleepOne()
Example #3
    def requests(cls, url, headers):
        for i in range(3):
            try:
                timeSleepOne()
                res = requests.get(url, headers=headers)
                res.encoding = 'utf-8'
                timeSleepRandomly()

                soup = BeautifulSoup(res.text, 'html.parser')
                newsContent = [
                    row for row in soup.select_one(".story").stripped_strings
                ]

                # video embedded in the article body
                if soup.p.iframe:  #.attrs.get("src"):
                    videoLinkInContent = soup.p.iframe.attrs.get("src")
                    print("ETtoday 發現內文有影片:", videoLinkInContent)

                else:
                    videoLinkInContent = None

                break
            except requests.exceptions.ConnectionError as e:
                print(url, "發生問題。", e)
                print()
                timeSleepRandomly()
                timeSleepTwo()
                newsContent = None
                videoLinkInContent = None

        return videoLinkInContent, newsContent
Example #4
def searchwordKeyInAndEnter(browser, searchword):
    # type the keyword into the search input box
    browser.find_by_xpath('//*[@id="tsf"]/div[2]/div/div[1]/div/div[1]/input').fill(searchword)
    timeSleepOne()
    # enter
    browser.find_by_xpath('//*[@id="tsf"]/div[2]/div/div[2]/div[2]/div/center/input[1]').click()
    timeSleepRandomly()
Example #5
    def requests(cls, url, headers):
        for i in range(3):
            try:
                timeSleepOne()
                res = requests.get(url, headers=headers)
                res.encoding = 'utf-8'
                timeSleepRandomly()

                soup = BeautifulSoup(res.text, 'html.parser')
                newsContent = [
                    textMiningRegex.discardSpace(
                        textMiningRegex.replaceEscapeAlphabet(row.text))
                    for row in soup.select_one(".article-body").select("p")
                    if row.text != ""
                ]
                videoLinkInContent = None  # the article body itself has no video

                break
            except requests.exceptions.ConnectionError as e:
                print(url, "發生問題。", e)
                print()
                timeSleepRandomly()
                timeSleepTwo()
                newsContent = None
                videoLinkInContent = None

        return videoLinkInContent, newsContent
Example #6
def getPageInARow(input, headers, objectiveFolder, objective, *args):
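    """Queue worker: for each "searchword+url+txtFileRoute" task, fetch the overview page and write its raw HTML to txtFileRoute."""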
    begin = timeCalculate()  # start timestamp, used by the elapsed-time print below
    thisPID = os.getpid()
    while True:
        # print(thisPID,"===========================================")
        consecutiveUrl = input.get()
        searchword, correctUrl, txtFileRoute = consecutiveUrl.split("+")

        fileName = txtFileRoute.split("/")[-1]
        page = fileName.split("_")[0]
        totalPage = fileName.split("_")[1]

        # print('getPageInARow is in new process %s, %s ' % (getPageInARow_proc, os.getpid()))
        # print('------接下來要處理 ' + searchword + '第' ,page, '頁---------共', totalPage, '頁')

        timeSleepOne()
        timeSleepRandomly()
        res = requests.get(correctUrl, headers=headers)
        res.encoding = 'utf-8'

        timeSleepRandomly()

        soup = BeautifulSoup(res.text, 'html.parser')

        with open(txtFileRoute, 'w', encoding='utf-8') as f:
            f.write(str(soup))
        print(f"成功寫出  {searchword}  第 {page} 頁, 共 {totalPage} 頁。")
        end = timeCalculate()
        print('getPageInARow 累計耗時:{0} 秒'.format(end - begin))
        input.task_done()  # tell the main process that this input item is done!
        timeSleepOne()  # pause a few seconds to mimic a real user.
Example #7
def searchwordKeyInAndEnter(browser, searchword):
    # type the keyword into the search input box
    # //*[@id="tsf"]/div[2]/div/div[1]/div/div[1]/input
    # //*[@id="tsf"]/div[2]/div[1]/div[1]/div/div[2]/input # this one works

    try:
        browser.find_by_xpath(
            '//*[@id="tsf"]/div[2]/div[1]/div[1]/div/div[2]/input').fill(
                searchword)
        timeSleepOne()
    except AttributeError as e:
        browser.find_by_xpath(
            '//*[@id="tsf"]/div[2]/div/div[1]/div/div[1]/input').fill(
                searchword)
        timeSleepOne()
    # enter
    """
    Message: unknown error: Element <input class="gNO89b" value="Google 搜尋" aria-label="Google 搜尋" name="btnK" type="submit" data-ved="0ahUKEwj79oGLmIPpAhULGqYKHRcCBy8Q4dUDCAk"> is not clickable at point (445, 567). Other element would receive the click: <div class="fbar">...</div>
    (Session info: headless chrome=80.0.3987.122)
    (Driver info: chromedriver=2.36.540471 (9c759b81a907e70363c6312294d30b6ccccc2752),platform=Linux 4.15.0-65-generic x86_64)
    """
    browser.driver.set_window_size(1920, 1080)
    browser.find_by_xpath(
        '//*[@id="tsf"]/div[2]/div/div[2]/div[2]/div/center/input[1]').click()
    # browser.find_by_value("Google 搜尋").click()

    timeSleepRandomly()
Example #8
def findOutNws(browser, topTabList):
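    """Map each index in topTabList to the text of the corresponding Google top tab, noting which one is "新聞"."""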
    findOutNws = {}
    findOutNws['1'] = "全部"
    for n in topTabList:
        broTmp = browser.find_by_xpath(f'//*[@id="hdtb-msb-vis"]/div[{n}]/a')
        if broTmp.text == "新聞":
            print(f"第 {n} 是新聞")
        findOutNws[str(n)] = broTmp.text
        timeSleepOne()
    return findOutNws
Example #9
    def requests(cls, url, headers):
        for i in range(3):
            try:
                timeSleepOne()
                res = requests.get(url, headers=headers)
                res.encoding = 'utf-8'
                timeSleepRandomly()

                soup = BeautifulSoup(res.text, 'html.parser')
                newsContent = ([
                    row for row in soup.select_one(".newsdetail_content").find(
                        "div", {
                            "class": "contxt margin_b20"
                        }).find("div", {
                            "id": "news_detail_div"
                        }).stripped_strings
                ])
                # video embedded in the article body
                if soup.select_one(".newsdetail_content").find(
                        "div", {
                            "class": "contxt margin_b20"
                        }).find("iframe", {"class": "video"}):
                    linkInContent = soup.select_one(
                        ".newsdetail_content").find(
                            "div", {
                                "class": "contxt margin_b20"
                            }).find("iframe", {
                                "class": "video"
                            }).attrs.get("src")
                    videoID = urlParseDealing.urlParsePath(
                        linkInContent
                    ).split("/")[
                        -1]  #videoID = link.split("/embed/")[1].split("?")[0]
                    videoLinkInContent = f"https://youtube.com/watch?v={videoID}"
                    print("TVBS 發現內文有影片:", videoLinkInContent)

                else:
                    videoLinkInContent = None

                break
            except requests.exceptions.ConnectionError as e:
                print(url, "發生問題。", e)
                print()
                timeSleepRandomly()
                timeSleepTwo()
                newsContent = None
                videoLinkInContent = None

        return videoLinkInContent, newsContent
Example #10
def humanSimulate(browser, topTabList):
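    """Simulate human-like behaviour: hover over a random top tab, scroll to the bottom and back, then hover over the Google logo."""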
    randomNum = random.choice(topTabList)
    print(randomNum,"================")
    try:
        browser.find_by_xpath(f'//*[@id="hdtb-msb-vis"]/div[{randomNum}]/a').mouse_over()
        timeSleepRandomly()
        browser.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        timeSleepOne()
        browser.execute_script('window.scrollTo(0,0);')

        if browser.is_element_present_by_xpath('//*[@id="logo"]/img'):
            browser.find_by_xpath('//*[@id="logo"]/img').mouse_over()
        elif browser.is_element_present_by_xpath('//*[@id="logocont"]/a/img'):
            browser.find_by_xpath('//*[@id="logocont"]/a/img').mouse_over()
    except AttributeError as e: # no element found to mouse_over(); the Google logo at the top-left can sit at either //*[@id="logocont"]/a/img or //*[@id="logo"]/img
        print("擬人化操作找不到 Element。", e)
        pass
Example #11
    def requests(cls, url, headers):
        for i in range(3):
            try:
                timeSleepOne()
                res = requests.get(url, headers=headers)
                res.encoding = 'utf-8'
                timeSleepRandomly()

                soup = BeautifulSoup(res.text, 'html.parser')

                pass

                break
            except requests.exceptions.ConnectionError as e:
                print(url, "發生問題。", e)
                print()
                timeSleepRandomly()
                timeSleepTwo()

        return
Example #12
def overviewUriDistributor(input, output, keywordUrlPair, headers, dirRoute,
                           objectiveFolder, objective, *args):
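    """For each searchword, look up its total page count, reset the overview folder, and queue one "searchword+url+txtFileRoute" task per page."""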
    begin = timeCalculate()  # start timestamp, used by the elapsed-time print below
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        searchword = input.get()
        url = keywordUrlPair[searchword]
        totalPage = getPageFirst(url + "1", headers)

        print('overviewUriDistributor is in new process %s, %s ' %
              (overviewUriDistributor_proc, os.getpid()))
        print('------接下來要發送 ' + searchword + ' 的overviewUri---------', '共',
              totalPage, '頁')

        # Don't postpone the folder checks until crawling time; doing so would keep creating and deleting folders.
        eraseRawData(objectiveFolder,
                     objective,
                     searchword,
                     keyword="overview")
        mkdirForRawData(objectiveFolder,
                        objective,
                        searchword,
                        keyword="overview")

        for page in range(1, int(totalPage) + 1):
            correctUrl = url + str(page)

            readyTxtFileRoute = dirRoute + f"{searchword}/overview/{page}_{totalPage}_{searchword}.txt"
            #TypeError: must be str, not tuple
            consecutiveData = searchword + "+" + correctUrl + "+" + readyTxtFileRoute

            output.put(consecutiveData)
        print(
            f'這裡是 overviewUriDistributor_{thisPID},準備送給  getPageInARow  處理 {totalPage} 頁的 overviewUri'
        )
        print()

        end = timeCalculate()
        print('overviewUriDistributor 累計耗時:{0} 秒'.format(end - begin))
        input.task_done()  # tell the main process that this input item is done!
        timeSleepOne()  # pause a few seconds to mimic a real user.
Example #13
    def requests(cls, url, headers):
        for i in range(3):
            try:
                timeSleepOne()

                if "https://ent.ltn.com.tw/news/" in url:
                    videoLinkInContent, newsContent = ltnRequests.requestsUrlWithENT(
                        url, headers)
                    break

                res = requests.get(url, headers=headers)
                res.encoding = 'utf-8'
                timeSleepRandomly()

                soup = BeautifulSoup(
                    res.text, 'lxml'
                )  # html.parser is not robust enough: it fails to grab the content of https://ec.ltn.com.tw/article/paper/1295417
                try:
                    newsContent = [
                        textMiningRegex.discardSpace(
                            textMiningRegex.replaceEscapeAlphabet(row.text))
                        for row in soup.select_one(".text").select("p")
                        if row.text != ""
                    ]
                    videoLinkInContent = None  # the article body itself has no video
                except AttributeError as e:
                    # https://news.ltn.com.tw/news/consumer/paper/1284005  --> https://ent.ltn.com.tw/news/paper/1284005
                    print("error code:", e, url)
                    videoLinkInContent, newsContent = ltnRequests.requestsUrlWithENT(
                        url, headers)
                break

            except requests.exceptions.ConnectionError as e:
                print(url, "發生問題。", e)
                print()
                timeSleepRandomly()
                timeSleepTwo()
                newsContent = None
                videoLinkInContent = None

        return videoLinkInContent, newsContent
Example #14
def distributeMonthAvailable(input, output, _weatherRecordAvailable,
                             objectiveFolder, objective, *args):
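    """For each year from the input queue, reset its raw-data folder and queue one "year+month" task per available month."""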
    begin = timeCalculate()
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        year = input.get()
        monthsAvailable = _weatherRecordAvailable[year]

        eraseRawData(objectiveFolder, objective, year)
        mkdirForRawData(objectiveFolder, objective, year)

        for month in monthsAvailable:
            consecutiveData = year + "+" + month
            output.put(consecutiveData)
            print(
                f'這裡是distributeMonthAvailable,準備送給  getPageInARow  處理: {year}年_{month}月 '
            )
        input.task_done()
        end = timeCalculate()
        print(f'{thisPID}_distributeMonthAvailable 累計耗時:{end-begin} 秒')
        timeSleepOne()
Example #15
    def requests(cls, url, headers):
        for i in range(3):
            try:
                timeSleepOne()
                res = requests.get(url, headers=headers)
                res.encoding = 'utf-8'
                timeSleepRandomly()

                soup = BeautifulSoup(res.text, 'html.parser')
                try:
                    newsContent = soup.find("article", {
                        "itemprop": "articleBody"
                    }).text.strip().split(" ")
                except AttributeError as e:
                    # url = "https://tw.news.yahoo.com/video/%E7%AF%80%E8%83%BD%E5%AE%B6%E9%9B%BB%E8%A3%9C%E5%8A%A9%E5%86%8D%E5%8A%A0%E7%A2%BC-%E8%B2%A8%E7%89%A9%E7%A8%85%E6%B8%9B%E5%85%8D%E9%96%8B%E8%B7%91-053307068.html"
                    # print("error code:", e, url)
                    try:
                        newsContent = soup.find("article").text.strip().split(
                            " ")
                    except AttributeError as e:
                        # "https://tw.news.yahoo.com/%E9%BB%83%E9%87%91%E9%80%B1%E5%A4%A7%E5%90%8C3c%E9%85%AC%E8%B3%93%E7%9B%9B%E5%85%B8-%E6%B6%BC%E5%A4%8F%E6%9C%80%E5%BC%B7%E6%AA%94-081101070.html": [
                        # "黃金週大同3C酬賓盛典涼夏最強檔",
                        print("error code:", "這則新聞爆炸了!", url)
                        newsContent = None

                videoLinkInContent = None  # the article body itself has no video
                break
            except requests.exceptions.ConnectionError as e:
                print(url, "發生問題。", e)
                print()
                timeSleepRandomly()
                timeSleepTwo()
                newsContent = None
                videoLinkInContent = None

        return videoLinkInContent, newsContent
Example #16
def humanSimulate(browser, topTabList):
    # Noticed on 2020/03/19: after clicking "圖片" (Images), Chrome's locale switched from Chinese to English, which changed the xpath.
    # //*[@id="yDmH0d"]/div[2]/c-wiz/div[1]/div/div[1]/div[1]/div/div/a[2]
    # //*[@id="yDmH0d"]/div[2]/c-wiz/div[1]/div/div[1]/div[1]/div/div/a[3]
    # //*[@id="yDmH0d"]/div[2]/c-wiz/div[1]/div/div[1]/div[1]/div/div/a[4]
    # AttributeError: 'ElementList' object has no attribute 'click'
    randomNum = random.choice(topTabList)
    print("對 topTabList 第", randomNum, "項,做擬人================")
    try:
        browser.find_by_xpath(
            f'//*[@id="hdtb-msb-vis"]/div[{randomNum}]/a').mouse_over()
        timeSleepRandomly()
        browser.execute_script(
            'window.scrollTo(0, document.body.scrollHeight);')
        timeSleepOne()
        browser.execute_script('window.scrollTo(0,0);')

        if browser.is_element_present_by_xpath('//*[@id="logo"]/img'):
            browser.find_by_xpath('//*[@id="logo"]/img').mouse_over()
        elif browser.is_element_present_by_xpath('//*[@id="logocont"]/a/img'):
            browser.find_by_xpath('//*[@id="logocont"]/a/img').mouse_over()
    except AttributeError as e:  # no element found to mouse_over(); the Google logo at the top-left can sit at either //*[@id="logocont"]/a/img or //*[@id="logo"]/img
        print("擬人化操作找不到 Element。", e)
        pass
Example #17
def getPageInARow(input, output, keywordUrlPair, objectiveFolder, objective):
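    """Visit the first result page of a searchword with a headless browser, save its HTML, then queue the remaining page numbers for the next worker."""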
    begin = timeCalculate()  # start timestamp, used by the elapsed-time print below
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        searchword = input.get()
        print('getPageInARow is in new process %s, %s ' %
              (getPageInARow_proc, thisPID))
        print()
        eraseRawData(objectiveFolder, objective, searchword)
        mkdirForRawData(objectiveFolder, objective, searchword)

        url = keywordUrlPair[searchword]

        # Creating the browser inside the while True loop avoids the same browser being rejected after visiting pages repeatedly.
        for i in range(3):
            try:
                timeSleepOne()
                timeSleepRandomly()

                browser = buildSplinterBrowserHeadless('chrome')

                timeSleepRandomly()

                browser.visit(url)

                browserWaitTime(browser)
                timeSleepTwo()

                tempHtml = browser.html

                timeSleepRandomly()
                soup = BeautifulSoup(tempHtml, 'html.parser')
                print(
                    f"讀取{searchword}第 1 頁>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>成功!"
                )
                break
            except (ConnectionRefusedError, TimeoutException,
                    WebDriverException) as e:
                print(
                    f"{thisPID}__{getPageInARow_proc}  讀取{searchword}第 1 頁有問題。",
                    e)
                print(
                    f"{thisPID}__{getPageInARow_proc}  重建browser物件,進行再處理 {i} 次!"
                )
                timeSleepFour()
                timeSleepRandomly()
                soup = ""
            # else:
            #     print(f"讀取{searchword}第 1 頁>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>成功!")

        try:
            totalPage = interDiv(searchNums(soup.select_one('.totalTxt').text),
                                 30)
        except AttributeError as e:
            print("getPageInARow 出錯", e)
            # force the program to stop here
            raise

        print('------接下來要處理 ' + searchword + ' 的頁數---------', totalPage, '頁')
        print()

        with open(
                f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/{searchword}/1_{totalPage}_{searchword}.txt",
                'w',
                encoding='utf-8') as f:
            f.write(str(soup))
        print()
        print(f'成功寫出  {searchword}  第 1 頁')

        i_browser = 1
        try:
            browser.quit()
            print(
                f"成功關閉 browser{getPageInARow_proc}++++++++++++++++++++++++++++++"
            )
        except:
            print(
                f"放棄 {thisPID}__{getPageInARow_proc} 的 第{i_browser}個browser。")
            i_browser += 1
            print(
                f"kill {thisPID}__{getPageInARow_proc} >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"
            )
            os.kill(thisPID, signal.SIGKILL)

        # Rest a bit longer so that the first page of every searchword gets read.
        timeSleepEight()
        timeSleepEight()

        for num in range(2, totalPage + 1):
            strNum = str(num)
            consecutiveData = searchword + "+" + strNum + "+" + str(
                totalPage) + "+" + re.sub(r"curPage=1", f"curPage={strNum}",
                                          url)
            output.put(consecutiveData)
            # print(f'這裡是getPageInARow,準備送給  getPageInARowAdvanced  處理:  {searchword} 的 第 {strNum} 頁,總共{totalPage}')
            print()
        input.task_done()  # tell the main process that this input item is done!
        end = timeCalculate()
        print(f'{thisPID}__getPageInARow 累計耗時:{end-begin} 秒')
Example #18
def dataMunging(input, dirRoute, objectiveFolderClean, objective, domainUrl):
    """
    "Id": "6631009",
      "name": "",
      "originprice": "NaN",
      "pics": "https://img1.momoshop.com.tw/goodsimg/0006/631/009/6631009_L.jpg?t=000",
      "picb": "None",
      "produrl": "https://www.momoshop.com.tw/goods/GoodsDetail.jsp?i_code=6631009&Area=search&mdiv=403&oid=55_8&cid=index&kw=%E5%86%B7%E6%9A%96%E7%A9%BA%E8%AA%BF"
    },
    
    Some products have a link but an incomplete price or name; these need handling.

    {
      "Id": "6574471",
      "name": "【MITSUBISHI 三菱】16公升一級能效強力型除濕機(MJ-E160HN)",
      "originprice": "NaN",
      "pics": "https://img1.momoshop.com.tw/goodsimg/0006/574/471/6574471_L.jpg?t=000",
      "picb": "None",
      "produrl": "https://www.momoshop.com.tw/goods/GoodsDetail.jsp?i_code=6574471&Area=search&mdiv=403&oid=3_22&cid=index&kw=%E9%99%A4%E6%BF%95%E6%A9%9F"
    }
    """
    begin = time.time()  # start timestamp, used by the elapsed-time print below
    thisPID = os.getpid()
    while True:
        print(thisPID,"===========================================")
        searchword = input.get()
        
        mkdirForCleanData(objectiveFolderClean, objective)

        # '/home/bluevc/2019/iSelect3C/dataMunging/rawData/momo/冷暖空調電熱水瓶'  <--- keywords kept accumulating in the route
        # dirRoute = dirRoute + searchword

        fileRoute = dirRoute + searchword
        
        if not os.listdir(fileRoute):
            print(f"============={objective} {searchword} 資料夾沒有東西,此進程準備結束。=============")
            input.task_done()
            timeSleepOne()
            print("========================break========================")
            break

        momoDict = {}
        productArray= [] 

        for file in initialFileZeroUnderscoreInt(fileRoute):
            # print("start " + file + " ! ")

            with open(fileRoute + "/" + file)as f:
                inn = f.read()

            # handle the case where soup was written out as ""
            if not inn:
                continue
            textSoup = BeautifulSoup(inn,'html.parser')
            try:
                # at most 30 items per page
                products = textSoup.select_one('.listArea').select_one('ul').select('li')
                for item in products:
                    innerDict = {}
                    innerDict['Id'] = item.attrs.get('gcode')

                    productName = item.select_one('.goodsUrl').select_one('.prdName').text
                    originprice = item.select_one('.goodsUrl').select_one('.money .price').text.replace('$','').replace(',','')
                    if productName:
                        innerDict['name'] = productName

                        if originprice in ("NaN", "熱銷一空"):
                            innerDict['originprice'] = "0"  #"NaN"
                        else:
                            innerDict['originprice'] = originprice
                            
                    else:
                        innerDict['name'] = "品名抓不到"
                        innerDict['originprice'] = "0"  #"NaN"

                    innerDict['pics'] = item.select_one('.goodsUrl img').attrs.get('src') 
                    innerDict['picb'] = "None"
                    innerDict['produrl'] = domainUrl + item.select_one('.goodsUrl').attrs.get('href')
                    productArray.append(innerDict)
            except Exception as e:
                print(f"{file} 有 {e} 的問題。")


        dateTime = datetime.datetime.now()
        fmt = "%Y-%m-%d-%H-%M"  #"%Y年%m月%d日%H時%M分"
        timeStamp = dateTime.strftime(fmt)

        momoDict['product'] = productArray
        momoDict['keyword'] = searchword
        momoDict["dateTime"] = timeStamp


        print("===========進行去重=============")

        momoDict['product'], setNums = EcommerceDataProcessToSet(momoDict['product'])

        with open(f"{_BASE_PATH}/dataMunging/{objectiveFolderClean}/{objective}/momo_{timeStamp}_{setNums}_{searchword}.json", 'w')as f:
            json.dump(momoDict, f, indent=2, ensure_ascii=False)

        print("===========清洗完成=============")
        print(f"這裡是dataMunging_{thisPID},準備完成工作。 ")
        print()
        end = time.time()
        print('dataMunging 累計耗時:{0} 秒'.format(end-begin))
        input.task_done()  # tell the main process that this input item is done!
        timeSleepOne()  # pause a few seconds to mimic a real user.
Example #19
    def requests(cls, url, headers):
        for i in range(3):
            try:
                timeSleepOne()
                res = requests.get(url, headers=headers)
                res.encoding = 'utf-8'
                timeSleepRandomly()

                soup = BeautifulSoup(res.text, 'html.parser')
                try:
                    newsContent = [
                        textMiningRegex.discardSpace(
                            textMiningRegex.replaceEscapeAlphabet(row.text))
                        for row in soup.select_one(".ndArticle_margin").select(
                            "p") if row.text != ""
                    ]
                    videoLinkInContent = None  # the article body itself has no video
                except AttributeError as e:  # AttributeError: 'NoneType' object has no attribute 'select'
                    soupStr = str(soup)
                    if "<br> \xa0</p>" in soupStr:
                        # "<br> \xa0</p>"  不需要變成 "<br> \\xa0</p>"
                        """
                        sqlalchemy.exc.OperationalError: (pymssql.OperationalError) (8152, b'String or binary data would be truncated.DB-Lib error message 8152, severity 16:\nGeneral SQL Server error: Check messages from the SQL Server\n')
                        [SQL: INSERT INTO selected_news_with_tfidf ([news_title_Id], [series_Id], [publisher_Id], news_content, video_link_in_content) VALUES (%(news_title_Id)s, %(series_Id)s, %(publisher_Id)s, %(news_content)s, %(video_link_in_content)s)]
                        [parameters: {'news_title_Id': '201912252', 'series_Id': UUID('9abd7eae-c361-496c-b10c-ae9fcf7be8bb'), 'publisher_Id': '5', 'news_content': '[\'<p> 今年農曆年節時間較早,家電採購需求較以往提早出現買氣,瞄準年前有汰換家中家電的需求,大同3C福利品特賣會特於12月底開跑,一路至明年1月初,提供消費者年前採購好選擇。<br> <br> 12月26日起至2020年1月8日止,全台各地共舉辦20場大同3C福利品特賣會,大小家電可在此一次 ... 
                        (3925 characters truncated) ... aws.com/ap-ne-1-prod/public/FLCZDN5FBRQBN6E6E3S7RP7IW4.jpg","version":"0.10.3","width":640},{"_id":"IO25XHAIRJE3FCUWV7YTXI66CY","type":"raw_html",\']', 'video_link_in_content': None}]
                        (Background on this error at: http://sqlalche.me/e/e3q8)
                        """

                        # https://tw.appledaily.com/property/20191226/WCUY7RP45D2V45RLRN3RULU2QU/
                        tmpStr = soupStr.split(
                            """<script type="application/javascript">window.Fusion="""
                        )[1].split("Fusion.globalContent=")[1].split(
                            '"content":"')[1].split("<br> \xa0</p>")[0]
                        newsContent = [
                            row for row in BeautifulSoup(
                                tmpStr, "html.parser").text.split(" ")
                            if row != ""
                        ]
                    else:
                        # https://tw.appledaily.com/gadget/20190927/IFU7ML7HXNAL2GHDNKOZULDNOU/
                        tmpStr = soupStr.split(
                            """<script type="application/javascript">window.Fusion="""
                        )[1].split("Fusion.globalContent=")[1].split(
                            '"content":"')[1].split("更多「")[0]
                        newsContent = [
                            row for row in tmpStr.split("<br />&nbsp;<br />")
                            if row != ""
                        ]

                        if len("".join(newsContent)) >= 3500:
                            # elif '<br />&nbsp;"' in soupStr:
                            # https://tw.appledaily.com/gadget/20191029/KSU3NPGRYURXTCI3COIUE6KMNM/
                            print(
                                f"appledaily news content exceeds 3500: {url}")
                            tmpStr = soupStr.split(
                                """<script type="application/javascript">window.Fusion="""
                            )[1].split("Fusion.globalContent=")[1].split(
                                '"content":"')[1].split('<br />&nbsp;"}')[0]
                            newsContent = [
                                row
                                for row in tmpStr.split("<br />&nbsp;<br />")
                                if row != ""
                            ]

                    videoLinkInContent = None  # the article body itself has no video

                break
            except requests.exceptions.ConnectionError as e:
                print(url, "發生問題。", e)
                print()
                timeSleepRandomly()
                timeSleepTwo()
                newsContent = None
                videoLinkInContent = None

        return videoLinkInContent, newsContent
Example #20
def getPageInARow(input, output, folderWorker, momoMallBrowser):
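    """Open the first momo result page for a searchword with a browser, record current/total page counts, save the HTML, then queue the remaining page numbers."""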
    begin = timeCalculate()  # start timestamp, used by the elapsed-time print below
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        searchword = input.get()
        print('getPageInARow is in new process %s, %s ' %
              (getPageInARow_proc, thisPID))
        folderWorker.eraseRawData(searchword)
        folderWorker.mkdirForRawData(searchword)

        url = momoMallBrowser.keywordResourcePair._momoMallKeywordUrlPair[
            searchword]

        # Creating the browser inside the while True loop avoids the same browser being rejected after visiting pages repeatedly.
        for i in range(4):
            try:
                timeSleepOne()
                timeSleepRandomly()

                browser = momoMallBrowser.intersectionForCrawl(
                    folderWorker.objective)

                timeSleepRandomly()

                browser.visit(url)

                browserWaitTime(browser)
                timeSleepTwo()

                # click "準確度" (accuracy); the page jumps back to page 1
                try:
                    buyingTendency = momoMallBrowser.browserClickSearchType(
                        browser, 1)
                    browserWaitTime(browser)
                    timeSleepTwo()
                except AttributeError as e:
                    print(
                        f"{thisPID}__{getPageInARow_proc}  {searchword} 第1頁 點擊準確度有問題。",
                        e)
                    print(
                        f"{thisPID}__{getPageInARow_proc}  重建browser物件,進行再處理 {i} 次!"
                    )
                    browserQuit(browser, thisPID, getPageInARow_proc)
                    timeSleepFour()
                    soup = ""
                    continue

                tempHtml = browser.html

                timeSleepRandomly()
                soup = BeautifulSoup(tempHtml, 'lxml')
                print(
                    f"-----------------讀取{searchword}_{buyingTendency}第 1 頁-----------------成功!"
                )

                try:
                    ## current page and total page '頁數5/286'

                    pageState = browser.find_by_xpath(
                        '//*[@id="bt_2_layout_Content"]/div[2]/dl/dt/span')
                    totalPage = int(pageState.text.split('/')[1])
                    currentPage = int(
                        numsHandler.searchFloatNums(
                            pageState.text.split('/')[0]))
                    print(
                        f"-----------------讀取{searchword}_{buyingTendency} 總頁數-----------------成功!"
                    )
                except AttributeError as e:
                    print(f"getPageInARow __{searchword}__出錯", e, "重抓一次!")
                    # Observation: "raise" only stops the currently running process, not the whole pool,
                    # so forcing a stop with "raise" is not suitable here.
                    # raise
                    currentPage = 1  # self-defined fallback
                    totalPage = 3  # self-defined fallback
                    continue
                break
            except (ConnectionRefusedError, TimeoutException,
                    WebDriverException) as e:
                print(
                    f"{thisPID}__{getPageInARow_proc}  讀取{searchword}第 1 頁有問題。",
                    e)
                print(
                    f"{thisPID}__{getPageInARow_proc}  重建browser物件,進行再處理 {i} 次!"
                )
                browserQuit(browser, thisPID, getPageInARow_proc)
                timeSleepFour()
                timeSleepRandomly()
                soup = ""
            except StaleElementReferenceException as e:
                print(
                    "----------------StaleElementReferenceException----------------"
                )
                print(
                    f"{thisPID}__{getPageInARow_proc}  讀取{searchword}第 1 頁有問題。",
                    e)
                print(
                    f"{thisPID}__{getPageInARow_proc}  重建browser物件,進行再處理 {i} 次!"
                )
                browserQuit(browser, thisPID, getPageInARow_proc)
                timeSleepFour()
                timeSleepRandomly()
                soup = ""

        if not soup:
            errorMessage = f"{url}__{currentPage}__" + "\n"
            folderWorker.writeOutFile(
                f"{folderWorker._BASE_PATH}/dataMunging/{folderWorker.objectiveFolder}/{folderWorker.objective}/badRequest",
                f"badRequest_{searchword}.txt",
                errorMessage,
                writeOutType="a")

        folderWorker.writeOutFile(
            f"{folderWorker._BASE_PATH}/dataMunging/{folderWorker.objectiveFolder}/{folderWorker.objective}/{searchword}",
            f"1_{totalPage}_{searchword}.txt", soup)

        print(f'成功寫出  {searchword}  第 {currentPage} 頁')

        print('------接下來要處理 ' + searchword + ' 的頁數---------', totalPage, '頁')

        browserQuit(browser, thisPID, getPageInARow_proc)

        # Rest a bit longer so that the first page of every searchword gets read.
        timeSleepEight()
        timeSleepEight()
        timeSleepEight()

        for num in range(2, totalPage + 1):
            strNum = str(num)
            consecutiveData = searchword + "+" + strNum + "+" + str(totalPage)
            output.put(consecutiveData)
            # print(f'這裡是getPageInARow,準備送給  getPageInARowAdvanced  處理:  {searchword} 的 第 {strNum} 頁,總共{totalPage}')
            # print()

        input.task_done()  # tell the main process that this input item is done!
        end = timeCalculate()
        print(f'{thisPID}__getPageInARow 累計耗時:{end-begin} 秒')
Example #21
def detailPageInARow(input, headers, objectiveFolder, objective, *args):
    """
    As many as 28,000 detail urls we are supposed to crawl would inevitalby leave some processes to fail to get the correct responses.
    As such, we should extend more time while crawling , or establish exception handler in porgrams.
    
    """
    # begin = timeCalculate()
    thisPID = os.getpid()
    while True:
        # print(thisPID,"===========================================")

        consecutiveUrl = input.get()
        searchword, url, txtFileRoute = consecutiveUrl.split("+")

        # print('detailPageInARow is in new process %s, %s ' % (detailPageInARow_proc, thisPID))
        # print()

        for i in range(3):
            try:
                timeSleepOne()
                res = requests.get(url, headers=headers)
                res.encoding = 'utf-8'
                timeSleepRandomly()
                soup = BeautifulSoup(res.text, 'html.parser')
                break
            except requests.exceptions.ConnectionError as e:
                print(url, "發生問題。", e)
                print()
                timeSleepRandomly()
                timeSleepTwo()
                soup = ""

        judgeSoup(soup, searchword, url, txtFileRoute)
        # if not soup:
        #   badRequestRoute = "/".join(txtFileRoute.split("/")[:-3]) + "/badRequest"
        #   with open(f"{badRequestRoute}/badRequest_{searchword}.txt", "a",  newline='', encoding='utf-8')as f: # newline沒作用...
        #       errorMessage = url + "\n"
        #       f.write(errorMessage)   #writelines作用在errorMessage是list時
        # elif soup.select_one('head').text.strip() == 'Service Unavailable':
        #   """

        #   「
        #   <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN""http://www.w3.org/TR/html4/strict.dtd">

        #   <html><head><title>Service Unavailable</title>
        #   <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/></head>
        #   <body><h2>Service Unavailable</h2>
        #   <hr/><p>HTTP Error 503. The service is unavailable.</p>
        #   </body></html>
        #   」

        #   """
        #   soup = ""

        with open(txtFileRoute, 'w', encoding='utf-8') as f:
            f.write(str(soup))

        fileName = txtFileRoute.split("/")[-1]
        productIndex = fileName.split("_")[0]
        productNums = fileName.split("_")[1]
        print(
            f"{thisPID}__成功寫出  {searchword}  detail頁, 第 {productIndex} 項, 共 {productNums} 項。"
        )

        timeSleepRandomly()

        # print('這裡是 detailPageInARow 完成: ' + fileName + " 的爬取。")
        end = timeCalculate()
        # print('detailPageInARow 累計耗時:{0} 秒'.format(end-begin))
        input.task_done()
Example #22
    def requests(cls, url, headers):
        for i in range(3):
            try:
                timeSleepOne()
                res = requests.get(url, headers=headers)
                res.encoding = 'utf-8'
                timeSleepRandomly()

                soup = BeautifulSoup(res.text, 'html.parser')
                newsContent = [
                    textMiningRegex.discardSpace(
                        textMiningRegex.replaceEscapeAlphabet(row.text)) for
                    row in soup.select_one("#story_body_content").select("p")
                    if row.text != ""
                ]
                videoLinkInContent = None  # the article body itself has no video
                break
            except AttributeError as e:

                try:
                    # 2020/02/07: the udn site was redesigned
                    newsContent = [
                        textMiningRegex.discardSpace(
                            textMiningRegex.replaceEscapeAlphabet(row.text))
                        for row in soup.find("article", {
                            "class": "article-content"
                        }).find_all("p") if row.text != ""
                    ]
                except AttributeError as e:
                    # If the request hits a 404, the html looks like the following.
                    '''
                    response404 = """<html>
                                <head>
                                <script>
                                                        var d = new Date();
                                                        d.setTime(d.getTime() + (300*1000));
                                                        var expires = "expires="+ d.toUTCString();
                                                        document.cookie = "burl=my-test-page01;" + expires + ";path=/";
                                                </script>
                                <!-- Google Tag Manager -->
                                <script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
                                                new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
                                                j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
                                                'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);
                                                })(window,document,'script','dataLayer','GTM-5CMHR66');</script>
                                <!-- End Google Tag Manager --><script>
                                                (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
                                                (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
                                                m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
                                                })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
                                                        </script>
                                <!-- #Location: /inc/meta/trace_ga -->
                                </head>
                                <body>
                                <!-- Google Tag Manager (noscript) -->
                                <noscript><iframe height="0" src="https://www.googletagmanager.com/ns.html?id=GTM-5CMHR66" style="display:none;visibility:hidden" width="0"></iframe></noscript>
                                <!-- End Google Tag Manager (noscript) -->
                                <script>
                                                window.location="/news/e404?nver";
                                        </script>
                                </body>
                                </html>"""
                    
                    '''

                    if searchWordTrueOrFalse(
                            "404",
                            str(soup.select_one("body").select_one("script"))
                    ):  #'<script>\n                window.location="/news/e404?nver";\n        </script>'
                        # https://udn.com/news/story/7238/3600804
                        print(url, "發生問題:404!")
                        newsContent = "404_None"
                    else:
                        # unknown case: dump the soup for inspection
                        print(soup)
                        newsContent = "404_None"
                        raise

                videoLinkInContent = None  # the article body itself has no video
                break
            except requests.exceptions.ConnectionError as e:
                print(url, "發生問題。", e)
                print()
                timeSleepRandomly()
                timeSleepTwo()
                newsContent = None
                videoLinkInContent = None

        return videoLinkInContent, newsContent
Example #23
        def browserClickPageNumber(cls, browser, currentPage, totalPage,
                                   searchword):
            """
                        #點擊頁數
                        # 預設
                        # 1 2 3 4 5 6 7 8 9 10 >> >|
                        # 頂 上10頁
                        # 1                                      14
                        # |< << 11 12 13 14 15 16 17 18 19 20 >> >|

                        # accuratePage =  browser.find_by_xpath('//*[@id="bt_2_layout_Content"]/div[2]/ul/li[8]/a')
                        accuratePage =  browser.find_by_xpath('//*[@id="bt_2_layout_Content"]/div[2]/ul/li[1]/a')
                        accuratePage.text
                        """

            currentPageNum = int(currentPage)
            if currentPageNum <= 10:
                browser.find_by_xpath(
                    f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{currentPageNum}]/a'
                ).click()
                accuratePage = browser.find_by_xpath(
                    f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{currentPageNum}]/a'
                ).text

            elif 11 <= currentPageNum <= 20:
                # go to pages 11~20
                browser.find_by_xpath(
                    f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[11]/a'
                ).click()
                clickNum = currentPageNum - 10 + 2
                browser.find_by_xpath(
                    f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{clickNum}]/a'
                ).click()
                accuratePage = browser.find_by_xpath(
                    f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{clickNum}]/a'
                ).text
            else:
                # go to pages 11~20
                browser.find_by_xpath(
                    f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[11]/a'
                ).click()

                clickNextTimes = currentPageNum // 10
                # click through to the screen that contains the correct page
                for i in range(clickNextTimes - 1):
                    browser.find_by_xpath(
                        f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[13]/a'
                    ).click()
                    timeSleepRandomly()
                    timeSleepOne()

                # click the correct page number
                judgeNum = currentPageNum - (clickNextTimes * 10)
                if judgeNum:
                    clickNum = judgeNum + 2
                elif judgeNum == 0:
                    clickNum = judgeNum + 12
                browser.find_by_xpath(
                    f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{clickNum}]/a'
                ).click()
                accuratePage = browser.find_by_xpath(
                    f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{clickNum}]/a'
                ).text

            print(
                f"{searchword}__目標頁碼:{currentPage}, 點擊頁碼:{accuratePage}, 總頁數:{totalPage}"
            )
Example #24
    def humanSimulate(cls, browser):
        """
        之所以click()不成功是因為mouse_over後,browser視窗看不到要點擊的xpath了!
        
        WebDriverException: Message: unknown error: Element <a class="selected">...</a> is not clickable at point (338, 13). Other element would receive the click: <div id="bt_0_002_01" class="">...</div>
        (Session info: chrome=80.0.3987.122)
        (Driver info: chromedriver=2.36.540471 (9c759b81a907e70363c6312294d30b6ccccc2752),platform=Linux 4.15.0-65-generic x86_64)
        
        但依舊可以用boolean的方式判斷;不過視窗的移動,不影響mouse_over()
        if browser.is_element_present_by_xpath('//*[@id="bt_2_layout_Content"]/div[2]/ul/li[1]/a'):
            print(1)  
        """
        searchTypeList = [row for row in range(1, 5)]
        pageList = [row for row in range(1, 13)]  # the pagination bar's first view only has 12 entries
        brandAndClassList = [row for row in range(2)]

        randomTypeNum = random.choice(searchTypeList)
        randomPageNum = random.choice(pageList)
        randomBrandClassNum = random.choice(brandAndClassList)

        try:
            try:
                # pagination bar: at most 12 entries; on the last page there are fewer than 12
                browser.find_by_xpath(
                    f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{randomPageNum}]/a'
                ).mouse_over()
                browserWaitTime(browser)
            except AttributeError as e:  # no element found to mouse_over()
                print("頁碼不足12項___擬人化操作找不到 Element。", e)
                browserWaitTime(browser)

            browser.execute_script(
                'window.scrollTo(0, document.body.scrollHeight);')
            timeSleepOne()
            # the four sort options (accuracy ... price, etc.)
            browser.find_by_xpath(
                f'//*[@id="bt_2_layout_Content"]/div[3]/span/ul/li[{randomTypeNum}]'
            ).mouse_over()
            timeSleepOne()

            # brand and product-category menus
            if randomBrandClassNum:
                if browser.is_element_present_by_xpath(
                        '//*[@id="categoriesBtn"]'):
                    browser.find_by_xpath(
                        '//*[@id="categoriesBtn"]').mouse_over()
                elif browser.is_element_present_by_xpath(
                        '//*[@id="bt_0_layout_b203"]'):
                    browser.find_by_xpath(
                        '//*[@id="bt_0_layout_b203"]').mouse_over()
            else:
                if browser.is_element_present_by_xpath(
                        '//*[@id="bt_0_layout_b203"]'):
                    browser.find_by_xpath(
                        '//*[@id="bt_0_layout_b203"]').mouse_over()
                elif browser.is_element_present_by_xpath(
                        '//*[@id="categoriesBtn"]'):
                    browser.find_by_xpath(
                        '//*[@id="categoriesBtn"]').mouse_over()

            timeSleepOne()
            browser.execute_script('window.scrollTo(0,0);')

        except AttributeError as e:  # no element found to mouse_over()
            print("擬人化操作找不到 Element。", e)
Example #25
    def browserClickPageNumber(cls, browser, currentPage, totalPage,
                               searchword):
        """
        #點擊頁數
        # 預設
        # 1 2 3 4 5 6 7 8 9 10 >> >|
        # 頂 上10頁
        # 1                                      14
        # |< << 11 12 13 14 15 16 17 18 19 20 >> >|
        
        # 置底
        # |< << 281 282 283 284

        # accuratePage =  browser.find_by_xpath('//*[@id="bt_2_layout_Content"]/div[2]/ul/li[8]/a')
        accuratePage =  browser.find_by_xpath('//*[@id="bt_2_layout_Content"]/div[2]/ul/li[1]/a')
        accuratePage.text
        """

        currentPageNum = int(currentPage)
        totalPageNum = int(totalPage)
        halfTotalPageNum = totalPageNum // 2

        if currentPageNum > halfTotalPageNum and currentPageNum > 10:
            # jump to the last-page view
            browser.find_by_xpath(
                f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[12]/a').click()
            timeSleepOne()

            if currentPageNum != totalPageNum and currentPageNum // 10 == totalPageNum // 10:
                if currentPageNum % 10 != 0:
                    # 13、18
                    clickBeforeTimes = 0
                elif currentPageNum % 10 == 0:
                    # 290、299
                    clickBeforeTimes = 1

                # click backwards to the screen that contains the correct page
                for i in range(clickBeforeTimes):
                    browser.find_by_xpath(
                        f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[2]/a'
                    ).click()
                    # timeSleepRandomly()
                    # timeSleepOne()
                    browserWaitTime(browser)

            elif currentPageNum != totalPageNum and currentPageNum // 10 < totalPageNum // 10:

                if currentPageNum % 10 != 0 and totalPageNum % 10 == 0:  # and totalPageNum - currentPageNum < 10:
                    # 281、290
                    # 271、290
                    # 11、30
                    clickBeforeTimes = (totalPageNum //
                                        10) - (currentPageNum // 10) - 1

                elif currentPageNum % 10 != 0 and totalPageNum % 10 != 0:  # and totalPageNum - currentPageNum >= 10:
                    # 271、291
                    # 18、23
                    clickBeforeTimes = (totalPageNum //
                                        10) - (currentPageNum // 10)

                elif currentPageNum % 10 == 0 and totalPageNum % 10 != 0:
                    # 270、291
                    clickBeforeTimes = (totalPageNum //
                                        10) - (currentPageNum // 10) + 1

                elif currentPageNum % 10 == 0 and totalPageNum % 10 == 0:
                    # 270、290
                    clickBeforeTimes = (totalPageNum //
                                        10) - (currentPageNum // 10)

                # click backwards to the screen that contains the correct page
                for i in range(clickBeforeTimes):
                    browser.find_by_xpath(
                        f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[2]/a'
                    ).click()
                    # timeSleepRandomly()
                    # timeSleepOne()
                    browserWaitTime(browser)

            # click the correct page number
            judgeNum = currentPageNum % 10
            if judgeNum:
                clickNum = judgeNum + 2
            elif judgeNum == 0:
                clickNum = judgeNum + 12
            print(
                f"反方向__{searchword}__目標頁碼:{currentPage}, 點擊項次:{clickNum}, 總頁數:{totalPage}"
            )
            browser.find_by_xpath(
                f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{clickNum}]/a'
            ).click()
            accuratePage = browser.find_by_xpath(
                f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{clickNum}]/a'
            ).text

            print(
                f"反方向__{searchword}__目標頁碼:{currentPage}, 點擊頁碼:{accuratePage}, 總頁數:{totalPage}"
            )

        else:
            if currentPageNum <= 10:
                browser.find_by_xpath(
                    f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{currentPageNum}]/a'
                ).click()
                accuratePage = browser.find_by_xpath(
                    f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{currentPageNum}]/a'
                ).text

            elif 11 <= currentPageNum <= 20:
                # go to pages 11~20
                browser.find_by_xpath(
                    f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[11]/a'
                ).click()
                clickNum = currentPageNum - 10 + 2
                timeSleepOne()
                browser.find_by_xpath(
                    f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{clickNum}]/a'
                ).click()
                accuratePage = browser.find_by_xpath(
                    f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{clickNum}]/a'
                ).text
            else:
                # go to pages 11~20
                browser.find_by_xpath(
                    f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[11]/a'
                ).click()

                if currentPageNum % 10 == 0:
                    # 電冰箱__目標頁碼:290, 點擊頁碼:300, 總頁數:921
                    clickNextTimes = currentPageNum // 10 - 1
                else:
                    # 冰箱__目標頁碼:292, 點擊頁碼:292, 總頁數:921
                    clickNextTimes = currentPageNum // 10

                # click through to the screen that contains the correct page
                for i in range(clickNextTimes - 1):  # minus 1 because we already went to pages 11~20
                    browser.find_by_xpath(
                        f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[13]/a'
                    ).click()
                    # timeSleepRandomly()
                    # timeSleepOne()
                    browserWaitTime(browser)

                # click the correct page number
                judgeNum = currentPageNum - (clickNextTimes * 10)
                if judgeNum:
                    clickNum = judgeNum + 2
                elif judgeNum == 0:
                    clickNum = judgeNum + 12
                browser.find_by_xpath(
                    f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{clickNum}]/a'
                ).click()
                accuratePage = browser.find_by_xpath(
                    f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{clickNum}]/a'
                ).text

            print(
                f"{searchword}__目標頁碼:{currentPage}, 點擊頁碼:{accuratePage}, 總頁數:{totalPage}"
            )
Example #26
def distributeKeyword(keywordUrlPair, output):
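    """Push every keyword of keywordUrlPair onto the output queue for the downstream processes."""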
    for keyword in keywordUrlPair:
        print('distributeKeyword in main process %s' % os.getpid())
        output.put(keyword)
        print("這裡是distributeKeyword,準備送給  接下來的進程  處理: " + keyword)
    timeSleepOne()  # pause a few seconds to mimic a real user.
Example #27
def dataMunging(input, output, dirRoute,objectiveFolder, objective, domainUrl, *args):
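    """Parse the saved overview pages of a searchword, build one product dict per table row, and dump the result as a JSON file under jsonIntegration/."""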
    thisPID = os.getpid()
    energyLabelUrl = "https://ranking.energylabel.org.tw/_Upload/applyMain/applyp/"
    bureauReplace = bureauEnergyReplace()
    while True:
        print(thisPID,"===========================================")
        searchword = input.get() 
        dirNameAccepted = dirRoute + f"{searchword}/overview/"
        dirNameWriteOut = dirRoute + f"{searchword}/"

        # Don't postpone the folder checks until crawling time; doing so would keep creating and deleting folders.
        eraseRawData(objectiveFolder, objective, searchword, keyword="jsonIntegration")
        mkdirForRawData(objectiveFolder, objective, searchword, keyword="jsonIntegration")

        print('dataMunging is in new process %s, %s ' % (dataMunging_proc, thisPID))
        print()
        print('------接下來要處理資料夾路徑「 ' + dirNameAccepted + '」---------')
        print()
        
        if not os.listdir(dirNameAccepted):
            print(f"============={objective} {searchword} 資料夾沒有東西,此進程準備結束。=============")
            input.task_done()
            timeSleepOne()
            print("========================break========================")
            break

        bureauEnergyDict = {}
        productArray = []
        
        for file in initialFileZeroUnderscoreInt(dirNameAccepted):
            # print(" start " + file + " ! ")
                
            with open(dirNameAccepted + file)as f:
                inn = f.read()
            
            # Handle pages that were written out as soup = "" (failed fetches).
            if not inn:
                continue
            
            textSoup = BeautifulSoup(inn,'html.parser')

            a = 0
            b = 7

            for i in range(10): # Each page lists up to ten products; every product spans 7 <td> elements.
                oneset = textSoup.find_all('div',{'class':'col-md-12 column'})[-1].find_all('td',{'align':'left'})[a:b]
                if oneset:
                    
                    detailUrl =  domainUrl + oneset[2].a.attrs.get('href')
                    
                    parseUrl = urlparse(detailUrl)
                    qsDict = parse_qs(parseUrl.query)
                    p1 = qsDict['id'].pop() # the 'id' query parameter plays the role of p1
                    p0 = qsDict['p0'].pop()
                    
                    productDict = {}
                    productDict['Id'] = p1 #oneset[2].a.attrs.get('href').split('id=')[1]
                    # The raw files contain dirty values, e.g. 冰箱 "product_model": "B23KV-81RE\n", "IB 7030 F TW"; 空調 "product_model": "PAX-K500CLD ",
                    productDict['product_model'] = bureauReplace.productModel(oneset[0].text)
                    productDict['brand_name'] = oneset[1].text
                    productDict['login_number'] = oneset[2].text
                    productDict['detailUri'] = detailUrl
                    productDict['labeling_company'] = oneset[3].text
                    productDict['efficiency_rating'] = oneset[4].text
                    productDict['from_date_of_expiration'] = bureauReplace.date(oneset[5].text)
                    
                    # We can assemble the outerUri ourselves:
                    # https://ranking.energylabel.org.tw/product/Approval/file_list.aspx?p1=20901&p0=82409
                    productDict['energy_efficiency_label_outerUri'] = f"{domainUrl}file_list.aspx?p1={p1}&p0={p0}"

                    # The innerUri we actually want looks like:
                    # https://ranking.energylabel.org.tw/_Upload/applyMain/applyp/20901/SB_photo1/EF2R-13DEX1.jpg
                    # productDict['energy_efficiency_label_innerUri'] = ... needs extra checks, so it is handled later in 「bureauEnergyMunging.py」 to avoid slowing down the crawl.


                    productArray.append(productDict)

                    a += 7
                    b += 7
                    # print('done ' + file + ' 的第' + str(i+1) + '份。')
                else:
                    print('在 ' + file + ' 的第' + str(i+1) + '處,發現空值!')
                    break
            
        bureauEnergyDict['product'] = productArray
        bureauEnergyDict['keyword'] = searchword
        timeStamp = timeStampGenerator()
        bureauEnergyDict["dateTime"] = timeStamp

        totalNums = len(bureauEnergyDict['product'])
        
        with open(dirNameWriteOut + f"jsonIntegration/{objective}_overview_{timeStamp}_{totalNums}_{searchword}.json","w",encoding="utf-8")as f:
            json.dump(bureauEnergyDict, f, indent=2, ensure_ascii=False)
        
        print(f'這裡是 dataMunging ,處理{searchword}完成: ' + dirNameWriteOut + "jsonIntegration/")


        # ========= If you only want to clean the overview html, this block can be commented out. ==========
        # Don't postpone the folder checks until crawl time; doing so would keep creating and deleting the folders.
        eraseRawData(objectiveFolder, objective, searchword, keyword="detail")
        mkdirForRawData(objectiveFolder, objective, searchword, keyword="detail")
        
        productIndex = 1
        for file in bureauEnergyDict['product']:
            detailUri = file['detailUri']
            readyTxtFileRoute = dirNameWriteOut + f"detail/{productIndex}_{totalNums}_{searchword}.txt"
            
            #TypeError: must be str, not tuple
            consecutiveData = searchword + "+" + detailUri + "+" + readyTxtFileRoute

            output.put(consecutiveData)
            # print('這裡是 dataMunging,準備送給  detailPageInARow  處理: ' + consecutiveData)
            # print()            
            productIndex += 1
        # ========= ================================



        end = timeCalculate()
        print('dataMunging 累計耗時:{0} 秒'.format(end-begin))
        input.task_done()
        timeSleepOne() # Pause a few seconds to mimic human pacing.
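As a quick sanity check of the p1/p0 handling above, the snippet below rebuilds the outerUri from a detail URL; the path of the sample URL is hypothetical, but the query parameters and the resulting file_list.aspx link match what dataMunging produces:

from urllib.parse import urlparse, parse_qs

domainUrl = "https://ranking.energylabel.org.tw/product/Approval/"
detailUrl = domainUrl + "detail.aspx?id=20901&p0=82409"  # hypothetical path, realistic query string

qsDict = parse_qs(urlparse(detailUrl).query)
p1 = qsDict['id'].pop()   # the 'id' parameter plays the role of p1
p0 = qsDict['p0'].pop()

outerUri = f"{domainUrl}file_list.aspx?p1={p1}&p0={p0}"
assert outerUri == "https://ranking.energylabel.org.tw/product/Approval/file_list.aspx?p1=20901&p0=82409"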
Пример #28
0
def getPageInARow(input, headers, objectiveFolder, objective, *args):
    thisPID = os.getpid()
    while True:
        # print(thisPID,"===========================================")
        consecutiveUrl = input.get()
        searchword, correctUrl, txtFileRoute = consecutiveUrl.split("+")

        fileName = txtFileRoute.split("/")[-1]
        page = fileName.split("_")[0]
        totalPage = fileName.split("_")[1]

        # print('getPageInARow is in new process %s, %s ' % (getPageInARow_proc, os.getpid()))
        # print('------接下來要處理 ' + searchword + '第' ,page, '頁---------共', totalPage, '頁')

        for i in range(4):
            if i <= 2:
                try:
                    timeSleepRandomly()
                    res = requests.get(correctUrl, headers=headers)
                    res.encoding = 'utf-8'
                    timeSleepRandomly()
                    timeSleepOne()
                    soup = BeautifulSoup(res.text, 'html.parser')
                    break
                except requests.exceptions.ConnectionError as e:
                    print(fileName, "發生問題。", i, e)
                    print()
                    timeSleepRandomly()
                    timeSleepTwo()
                    soup = ""
            else:
                try:
                    timeSleepEight()
                    timeSleepRandomly()
                    res = requests.get(correctUrl, headers=headers)
                    res.encoding = 'utf-8'
                    timeSleepRandomly()
                    soup = BeautifulSoup(res.text, 'html.parser')
                    break
                except requests.exceptions.ConnectionError as e:
                    print(fileName, "發生問題。", i, e)
                    print()
                    soup = ""

        # If judgeSoup flags the second failure condition, force soup to an empty string.
        if judgeSoup(soup, searchword, correctUrl, txtFileRoute) == "check":
            soup = ""

        # Original version, before the retry loop above was added:
        # timeSleepOne()
        # timeSleepRandomly()
        # res = requests.get(correctUrl, headers=headers)
        # res.encoding = 'utf-8'
        # timeSleepRandomly()
        # soup  = BeautifulSoup(res.text,'html.parser')

        with open(txtFileRoute, 'w', encoding='utf-8') as f:
            f.write(str(soup))
        # print(f"成功寫出  {searchword}  第 {page} 頁, 共 {totalPage} 頁。")
        end = timeCalculate()
        # print('getPageInARow 累計耗時:{0} 秒'.format(end-begin))
        input.task_done()  # Tell the main process this input item is done!
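The retry dance above (three quick attempts, then one last attempt after a long cool-down, falling back to an empty string) can be condensed into a helper; this is only a sketch with a hypothetical name, reusing the project's sleep utilities shown elsewhere:

import requests
from bs4 import BeautifulSoup

def fetchSoupWithRetry(url, headers, quickTries=3):
    for attempt in range(quickTries + 1):
        try:
            if attempt == quickTries:
                timeSleepEight()      # long cool-down before the final attempt
            timeSleepRandomly()
            res = requests.get(url, headers=headers)
            res.encoding = 'utf-8'
            return BeautifulSoup(res.text, 'html.parser')
        except requests.exceptions.ConnectionError as e:
            print(url, "發生問題。", attempt, e)
            timeSleepTwo()
    return ""  # caller writes "" out so the cleaning stage can skip the file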
Пример #29
0
def dataMunging(input, dirRoute, objectiveFolderClean, objective):
    begin = timeCalculate()  # start time for the elapsed-time print below
    thisPID = os.getpid()
    bureauMunging = bureauEnergyMunging()
    while True:
        print(thisPID,"===========================================")
        searchword = input.get()

        dirNameCheck = dirRoute + f"{searchword}/"
        directory = dirRoute + f"{searchword}/detail/"
        dirNameWriteOut = dirRoute + f"{searchword}/jsonIntegration/"

        print('dataMunging is in new process %s, %s ' % (dataMunging_proc, thisPID))
        print()
        print('------接下來要處理資料夾路徑「 ' + dirNameWriteOut  + '」---------')
        print()


        mkdirForCleanData(objectiveFolderClean, objective)

        if not os.listdir(dirNameCheck):
            print(f"============={objective} {searchword} 資料夾沒有東西,此進程準備結束。=============")
            input.task_done()
            timeSleepOne()
            print("========================break========================")
            break

        # This block has been replaced by the simplified call below; once the cleaned data all checks out, it can be deleted.
        # if searchword == "除濕機":
        #     bureauEnergyDetail, totalNums = bureauMunging.detailDehumidification(searchword, directory)
        # elif searchword == "無風管空氣調節機":
        #     bureauEnergyDetail, totalNums = bureauMunging.detailAirConditioner(searchword, directory)
        # elif searchword == "電冰箱":
        #     bureauEnergyDetail, totalNums = bureauMunging.detailRefrigerator(searchword, directory)
        # elif searchword == "電熱水瓶":
        #     bureauEnergyDetail, totalNums = bureauMunging.detailElectricWarmer(searchword, directory)
        # elif searchword == "溫熱型開飲機":
        #     bureauEnergyDetail, totalNums = bureauMunging.detailWarmDrinkMachine(searchword, directory)
        # elif searchword == "溫熱型飲水機":
        #     bureauEnergyDetail, totalNums = bureauMunging.detailWarmDispenser(searchword, directory)
        # elif searchword == "冰溫熱型開飲機":
        #     bureauEnergyDetail, totalNums = bureauMunging.detailColdWarmDrinkMachine(searchword, directory)
        # elif searchword == "冰溫熱型飲水機":
        #     bureauEnergyDetail, totalNums = bureauMunging.detailColdWarmDispenser(searchword, directory)
        # elif searchword == "貯備型電熱水器":
        #     bureauEnergyDetail, totalNums = bureauMunging.detailStorageWaterHeaters(searchword, directory)
        # elif searchword == "瓦斯熱水器":
        #     bureauEnergyDetail, totalNums = bureauMunging.detailGasWaterHeaters(searchword, directory)
        # elif searchword == "瓦斯爐":
        #     bureauEnergyDetail, totalNums = bureauMunging.detailGasStove(searchword, directory)
        # elif searchword == "安定器內藏式螢光燈泡":
        #     bureauEnergyDetail, totalNums = bureauMunging.detailCompactFluorescentLamp(searchword, directory)

        # '無風管空氣調節機', '除濕機', '電冰箱', '電熱水瓶', '溫熱型開飲機',
        # '溫熱型飲水機', '冰溫熱型開飲機', '冰溫熱型飲水機', '貯備型電熱水器' , '瓦斯熱水器', '瓦斯爐', '安定器內藏式螢光燈泡'
        bureauEnergyDetail, totalNums = bureauMunging.detailMungingEntry(searchword, directory)

        with open(dirNameWriteOut + f"{objective}_detail_{timeStampGenerator()}_{totalNums}_{searchword}.json",'w',encoding='utf-8')as f:
            json.dump(bureauEnergyDetail, f, indent=2, ensure_ascii=False)

        # Locate the overviewJsonFile and merge it with the detailJsonFile:
        overviewJsonFile = [overviewFile for overviewFile in os.listdir(dirNameWriteOut) if "bureauEnergy_overview" in overviewFile].pop()
        with open(dirNameWriteOut + overviewJsonFile)as f:
            bureauEnergyOverview = json.load(f)

        modelPool = [comparedValue['product_model'] for comparedValue in bureauEnergyDetail['productDetail']]
        modelPoolDict = {v: k for k, v in enumerate(modelPool)}  # product_model -> index in bureauEnergyDetail['productDetail']


        # Walk the overview JSON and add the detail fields to every product.
        for jsonObject in bureauEnergyOverview['product']:
            index, test_report_of_energy_efficiency, benchmark, annual, labelUri = zipJsonObject(modelPoolDict, jsonObject['product_model'], bureauEnergyDetail)
            
            # print('正在處理索引值: '+str(index))
            jsonObject['test_report_of_energy_efficiency'] = test_report_of_energy_efficiency
            jsonObject['efficiency_benchmark'] = benchmark
            jsonObject['annual_power_consumption_degrees_dive_year'] = annual
            jsonObject['energy_efficiency_label_innerUri'] = labelUri
            # print('done '+str(index))

        # Refresh the timestamp now that the JSON carries the extra fields.
        timeStamp = timeStampGenerator()
        bureauEnergyOverview["dateTime"] = timeStamp
        
        with open(f"{_BASE_PATH}/dataMunging/{objectiveFolderClean}/{objective}/{objective}_{timeStamp}_{totalNums}_{searchword}.json",'w',encoding='utf-8')as f:
            json.dump(bureauEnergyOverview, f, indent=2, ensure_ascii=False)

        statistic.append(totalNums)

        print(f"這裡是dataMunging_{thisPID},準備完成工作。 ")
        print()
        end = timeCalculate()
        print('dataMunging 累計耗時:{0} 秒'.format(end-begin))
        input.task_done()  # Tell the main process this input item is done!
        timeSleepOne() # Pause a few seconds to mimic human pacing.
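The overview/detail merge above boils down to indexing the detail records by product_model once and enriching each overview product via that index; a stripped-down sketch with dummy values (the project's zipJsonObject returns more columns) follows:

detail = {"productDetail": [{"product_model": "EF2R-13DEX1", "efficiency_benchmark": "4.0"}]}
overview = {"product": [{"product_model": "EF2R-13DEX1"}]}

modelPoolDict = {row["product_model"]: i for i, row in enumerate(detail["productDetail"])}
for jsonObject in overview["product"]:
    matched = detail["productDetail"][modelPoolDict[jsonObject["product_model"]]]
    jsonObject["efficiency_benchmark"] = matched["efficiency_benchmark"]

assert overview["product"][0]["efficiency_benchmark"] == "4.0"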
Пример #30
0
def dataMunging(input, dirRoute, objectiveFolderClean, objective, domainUrl):
    thisPID = os.getpid()
    begin = time.time()  # assumed: start time for the elapsed-time print below; the original may rely on a module-level 'begin'
    while True:
        print(thisPID, "===========================================")
        searchword = input.get()

        mkdirForCleanData(objectiveFolderClean, objective)

        # '/home/bluevc/2019/iSelect3C/dataMunging/rawData/momo/冷暖空調電熱水瓶'  <--- keywords get concatenated if dirRoute itself is mutated
        # dirRoute = dirRoute + searchword

        fileRoute = dirRoute + searchword

        if not os.listdir(fileRoute):
            print(
                f"============={objective} {searchword} 資料夾沒有東西,此進程準備結束。============="
            )
            input.task_done()
            timeSleepOne()
            print("========================break========================")
            break

        momoDict = {}
        productArray = []

        for file in initialFileZeroUnderscoreInt(fileRoute):
            # print("start " + file + " ! ")

            with open(fileRoute + "/" + file) as f:
                inn = f.read()

            # Handle pages that were written out as soup = "" (failed fetches).
            if not inn:
                continue
            textSoup = BeautifulSoup(inn, 'html.parser')
            try:
                # A page lists at most 30 items.
                products = textSoup.select_one('.listArea').select_one(
                    'ul').select('li')
                for item in products:
                    innerDict = {}
                    innerDict['Id'] = item.attrs.get('gcode')
                    innerDict['name'] = item.select_one(
                        '.goodsUrl').select_one('.prdName').text
                    innerDict['originprice'] = item.select_one(
                        '.goodsUrl').select_one('.money .price').text.replace(
                            '$', '').replace(',', '')
                    innerDict['pics'] = item.select_one(
                        '.goodsUrl img').attrs.get('src')
                    innerDict['picb'] = "None"
                    innerDict['produrl'] = domainUrl + item.select_one(
                        '.goodsUrl').attrs.get('href')
                    productArray.append(innerDict)
            except Exception as e:
                print(f"{file} 有 {e} 的問題。")

        dateTime = datetime.datetime.now()
        fmt = "%Y-%m-%d-%H-%M"  #"%Y年%m月%d日%H時%M分"
        timeStamp = dateTime.strftime(fmt)

        momoDict['product'] = productArray
        momoDict['keyword'] = searchword
        momoDict["dateTime"] = timeStamp

        print("===========進行去重=============")

        momoDict['product'], setNums = EcommerceDataProcessToSet(
            momoDict['product'])

        with open(
                f"{_BASE_PATH}/dataMunging/{objectiveFolderClean}/{objective}/momo_{timeStamp}_{setNums}_{searchword}.json",
                'w') as f:
            json.dump(momoDict, f, indent=2, ensure_ascii=False)

        print("===========清洗完成=============")
        print(f"這裡是dataMunging_{thisPID},準備完成工作。 ")
        print()
        end = time.time()
        print('dataMunging 累計耗時:{0} 秒'.format(end - begin))
        input.task_done()  # Tell the main process this input item is done!
        timeSleepOne()  # Pause a few seconds to mimic human pacing.
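EcommerceDataProcessToSet is a project helper whose implementation is not shown here; a plausible minimal equivalent deduplicates the scraped momo products by 'Id' and reports the remaining count, as sketched below with dummy data:

def dedupeProductsById(products):
    seen, unique = set(), []
    for p in products:
        if p['Id'] not in seen:
            seen.add(p['Id'])
            unique.append(p)
    return unique, len(unique)

sample = [{'Id': 'A1', 'name': '冰箱'}, {'Id': 'A1', 'name': '冰箱'}, {'Id': 'B2', 'name': '除濕機'}]
assert dedupeProductsById(sample) == ([{'Id': 'A1', 'name': '冰箱'}, {'Id': 'B2', 'name': '除濕機'}], 2)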