Example #1
def getPageFirst(searchword, keyword, headers):
    url = "https://ecshweb.pchome.com.tw/search/v3.3/{0}/results?q={1}&page=1&sort=sale/dc".format(keyword, searchword)
    for i in range(3):
        try:
            timeSleepRandomly()
            res = requests.get(url, headers=headers)
            timeSleepRandomly()
            res.encoding = 'utf-8'
            jsonPage = json.loads(res.text)
            totalPage = jsonPage['totalPage']
            totalRows = jsonPage['totalRows']
            timeSleepEight()
            timeSleepRandomly()
            break
        except (JSONDecodeError, ConnectionRefusedError) as e:  # If visits are too frequent, the JSON PChome returns is malformed, so deserialization raises an exception.
            print(f"getPageFirst {keyword}  {searchword} 這裡發生錯誤   "+str(e)+"正在處理中。")
            timeSleepEight()
            timeSleepRandomly()
            totalPage = "99999"
            totalRows = "99999"
        except requests.exceptions.ConnectionError as e:
            print(f"getPageFirst {keyword}  {searchword} 這裡發生錯誤   "+str(e)+"正在處理中。")
            timeSleepEight()
            timeSleepRandomly()
            totalPage = "99999"
            totalRows = "99999"
    return totalPage, totalRows
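The sleep helpers (timeSleepOne, timeSleepTwo, timeSleepFour, timeSleepEight, timeSleepRandomly) and timeCalculate used throughout these examples are never shown. Below is a minimal sketch of what they might look like, inferred only from their names and how they are called; the real project may use different durations.

import random
import time

def timeSleepOne():
    time.sleep(1)

def timeSleepTwo():
    time.sleep(2)

def timeSleepFour():
    time.sleep(4)

def timeSleepEight():
    time.sleep(8)

def timeSleepRandomly():
    # Sleep a short random interval so request timing is less regular.
    time.sleep(random.uniform(1, 3))

def timeCalculate():
    # Used as `end = timeCalculate()` against a module-level `begin`,
    # so a plain timestamp is assumed here.
    return time.time()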
Example #2
def requestsHandlingWhenTimeoutOccur(url, browserName):
    timeSleepEight()
    browser = buildSplinterBrowserHeadless(browserName)
    timeSleepRandomly()
    browser.visit(url)
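Example 2 depends on buildSplinterBrowserHeadless, which is not shown. A plausible sketch using splinter's headless Chrome driver follows; the project's actual helper may configure extra options (user agent, window size), so treat this as an assumption. Note also that, as shown, requestsHandlingWhenTimeoutOccur never returns or quits the browser it creates.

from splinter import Browser

def buildSplinterBrowserHeadless(browserName):
    # Assumed: splinter's Browser() with headless=True; 'chrome' is the only
    # browserName used in these examples.
    return Browser(browserName, headless=True)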
Example #3
def getPageInARow(input, output, keywordUrlPair, objectiveFolder, objective):
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        searchword = input.get()
        print('getPageInARow is in new process %s, %s ' %
              (getPageInARow_proc, thisPID))
        print()
        eraseRawData(objectiveFolder, objective, searchword)
        mkdirForRawData(objectiveFolder, objective, searchword)

        url = keywordUrlPair[searchword]

        # Building the browser inside the while True loop avoids the "same browser" being rejected after visiting pages repeatedly.
        for i in range(3):
            try:
                timeSleepOne()
                timeSleepRandomly()

                browser = buildSplinterBrowserHeadless('chrome')

                timeSleepRandomly()

                browser.visit(url)

                browserWaitTime(browser)
                timeSleepTwo()

                tempHtml = browser.html

                timeSleepRandomly()
                soup = BeautifulSoup(tempHtml, 'html.parser')
                print(
                    f"讀取{searchword}第 1 頁>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>成功!"
                )
                break
            except (ConnectionRefusedError, TimeoutException,
                    WebDriverException) as e:
                print(
                    f"{thisPID}__{getPageInARow_proc}  讀取{searchword}第 1 頁有問題。",
                    e)
                print(
                    f"{thisPID}__{getPageInARow_proc}  重建browser物件,進行再處理 {i} 次!"
                )
                timeSleepFour()
                timeSleepRandomly()
                soup = ""
            # else:
            #     print(f"讀取{searchword}第 1 頁>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>成功!")

        try:
            totalPage = interDiv(searchNums(soup.select_one('.totalTxt').text),
                                 30)
        except AttributeError as e:
            print("getPageInARow 出錯", e)
            # Force the program to stop here.
            raise

        print('------接下來要處理 ' + searchword + ' 的頁數---------', totalPage, '頁')
        print()

        with open(
                f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/{searchword}/1_{totalPage}_{searchword}.txt",
                'w',
                encoding='utf-8') as f:
            f.write(str(soup))
        print()
        print(f'成功寫出  {searchword}  第 1 頁')

        i_browser = 1
        try:
            browser.quit()
            print(
                f"成功關閉 browser{getPageInARow_proc}++++++++++++++++++++++++++++++"
            )
        except:
            print(
                f"放棄 {thisPID}__{getPageInARow_proc} 的 第{i_browser}個browser。")
            i_browser += 1
            print(
                f"kill {thisPID}__{getPageInARow_proc} >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"
            )
            os.kill(thisPID, signal.SIGKILL)

        # Rest a bit longer so that the first page of every searchword gets fetched.
        timeSleepEight()
        timeSleepEight()

        for num in range(2, totalPage + 1):
            strNum = str(num)
            consecutiveData = searchword + "+" + strNum + "+" + str(
                totalPage) + "+" + re.sub(r"curPage=1", f"curPage={strNum}",
                                          url)
            output.put(consecutiveData)
            # print(f'這裡是getPageInARow,準備送給  getPageInARowAdvanced  處理:  {searchword} 的 第 {strNum} 頁,總共{totalPage}')
            print()
        input.task_done()  # Notify the main process that this input item has been processed.
        end = timeCalculate()
        print(f'{thisPID}__getPageInARow 累計耗時:{end-begin} 秒')
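Examples 3 through 7 all follow the same producer/consumer pattern: each worker loops forever on input.get() and signals completion with input.task_done(), while names such as getPageInARow_proc and begin are assumed to be module-level globals. The sketch below shows how a main process might wire this up with multiprocessing.JoinableQueue; the driver names and worker count are illustrative, not the project's actual code.

import multiprocessing as mp
import time

begin = time.time()
getPageInARow_proc = 4  # assumed number of worker processes

def start_workers(searchwords, keywordUrlPair, objectiveFolder, objective):
    input_queue = mp.JoinableQueue()
    output_queue = mp.JoinableQueue()

    # Daemon workers loop on input_queue.get() forever, so they exit with the main process.
    for _ in range(getPageInARow_proc):
        p = mp.Process(target=getPageInARow,
                       args=(input_queue, output_queue, keywordUrlPair,
                             objectiveFolder, objective))
        p.daemon = True
        p.start()

    for word in searchwords:
        input_queue.put(word)

    # join() returns once every put() item has been matched by a task_done().
    input_queue.join()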
Example #4
def getPageInARow(input, headers, objectiveFolder, objective, *args):
    thisPID = os.getpid()
    while True:
        # print(thisPID,"===========================================")
        consecutiveUrl = input.get()
        searchword, correctUrl, txtFileRoute = consecutiveUrl.split("+")

        fileName = txtFileRoute.split("/")[-1]
        page = fileName.split("_")[0]
        totalPage = fileName.split("_")[1]

        # print('getPageInARow is in new process %s, %s ' % (getPageInARow_proc, os.getpid()))
        # print('------接下來要處理 ' + searchword + '第' ,page, '頁---------共', totalPage, '頁')

        for i in range(4):
            if i <= 2:
                try:
                    timeSleepRandomly()
                    res = requests.get(correctUrl, headers=headers)
                    res.encoding = 'utf-8'
                    timeSleepRandomly()
                    timeSleepOne()
                    soup = BeautifulSoup(res.text, 'html.parser')
                    break
                except requests.exceptions.ConnectionError as e:
                    print(fileName, "發生問題。", i, e)
                    print()
                    timeSleepRandomly()
                    timeSleepTwo()
                    soup = ""
            else:
                try:
                    timeSleepEight()
                    timeSleepRandomly()
                    res = requests.get(correctUrl, headers=headers)
                    res.encoding = 'utf-8'
                    timeSleepRandomly()
                    soup = BeautifulSoup(res.text, 'html.parser')
                    break
                except requests.exceptions.ConnectionError as e:
                    print(fileName, "發生問題。", i, e)
                    print()
                    soup = ""

        # If the second condition is triggered, force soup to an empty string.
        if judgeSoup(soup, searchword, correctUrl, txtFileRoute) == "check":
            soup = ""

        # Original version:
        # timeSleepOne()
        # timeSleepRandomly()
        # res = requests.get(correctUrl, headers=headers)
        # res.encoding = 'utf-8'
        # timeSleepRandomly()
        # soup  = BeautifulSoup(res.text,'html.parser')

        with open(txtFileRoute, 'w', encoding='utf-8') as f:
            f.write(str(soup))
        # print(f"成功寫出  {searchword}  第 {page} 頁, 共 {totalPage} 頁。")
        end = timeCalculate()
        # print('getPageInARow 累計耗時:{0} 秒'.format(end-begin))
        input.task_done()  # Notify the main process that this input item has been processed.
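The worker in Example 4 expects each queue item to be a single '+'-joined string of searchword, URL, and output path, where the file name starts with "<page>_<totalPage>_". A hedged sketch of how a producer might compose such items is shown below; the URL pattern and directory layout are illustrative assumptions.

def enqueuePageJobs(input_queue, searchword, baseUrl, totalPage, rawDataDir):
    # Each item must split into exactly three fields on "+",
    # and the file name must begin with "<page>_<totalPage>_".
    for page in range(1, totalPage + 1):
        correctUrl = f"{baseUrl}&curPage={page}"            # illustrative URL pattern
        txtFileRoute = f"{rawDataDir}/{page}_{totalPage}_{searchword}.txt"
        input_queue.put(searchword + "+" + correctUrl + "+" + txtFileRoute)

This only works because the URLs and paths involved never contain a literal '+'; a structured message (e.g. a tuple) would avoid that constraint.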
Example #5
def detailPageInARow(input,  headers, objectiveFolder, objective, *args):
    """
    With as many as 28,000 detail URLs to crawl, some requests will inevitably fail to get a correct response.
    As such, we should allow more time while crawling, or add exception handlers to the program.
    
    """
    # begin = timeCalculate()
    thisPID = os.getpid()
    while True:
        # print(thisPID,"===========================================")
        
        consecutiveUrl = input.get()
        searchword, url, txtFileRoute = consecutiveUrl.split("+")
        
        # print('detailPageInARow is in new process %s, %s ' % (detailPageInARow_proc, thisPID))
        # print()

        for i in range(4):
            if i <= 2:
                try:
                    timeSleepTwo()
                    res = requests.get(url, headers=headers)
                    res.encoding = 'utf-8'
                    timeSleepRandomly()
                    soup = BeautifulSoup(res.text, 'html.parser')
                    break
                except requests.exceptions.ConnectionError as e:
                    print(url, "發生問題。", e)
                    print()
                    timeSleepRandomly()
                    timeSleepTwo()
                    timeSleepTwo()
                    soup = ""
            else:
                try:
                    timeSleepEight()
                    res = requests.get(url, headers=headers)
                    res.encoding = 'utf-8'
                    timeSleepRandomly()
                    soup = BeautifulSoup(res.text, 'html.parser')
                    break
                except requests.exceptions.ConnectionError as e:
                    print(txtFileRoute, "發生問題。", e)
                    print()
                    soup = ""

        # If the second condition is triggered, force soup to an empty string.
        if judgeSoup(soup, searchword, url, txtFileRoute) == "check":
            soup = ""
        


        with open(txtFileRoute, 'w', encoding='utf-8') as f:
            f.write(str(soup))
        
        fileName = txtFileRoute.split("/")[-1]
        productIndex = fileName.split("_")[0]
        productNums = fileName.split("_")[1]
        print(f"{thisPID}__成功寫出  {searchword}  detail頁, 第 {productIndex} 項, 共 {productNums} 項。")
            
        timeSleepRandomly()

        # print('這裡是 detailPageInARow 完成: ' + fileName + " 的爬取。")
        end = timeCalculate()
        # print('detailPageInARow 累計耗時:{0} 秒'.format(end-begin))
        input.task_done()
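Examples 4 and 5 implement the same retry policy inline: up to three quick attempts with short sleeps, then one last attempt after a long sleep, falling back to an empty string. Below is a sketch of that policy factored into a reusable helper; it is a refactoring idea, not code from the project.

import time

import requests
from bs4 import BeautifulSoup

def fetchSoupWithRetry(url, headers, quick_attempts=3, quick_sleep=2, long_sleep=8):
    # Quick attempts first, then one slow attempt; return "" if everything fails.
    for attempt in range(quick_attempts + 1):
        try:
            if attempt < quick_attempts:
                time.sleep(quick_sleep)
            else:
                time.sleep(long_sleep)
            res = requests.get(url, headers=headers)
            res.encoding = 'utf-8'
            return BeautifulSoup(res.text, 'html.parser')
        except requests.exceptions.ConnectionError as e:
            print(url, "failed on attempt", attempt, e)
    return ""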
Example #6
def getPageInARow(input, output, folderWorker, momoMallBrowser):
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        searchword = input.get()
        print('getPageInARow is in new process %s, %s ' %
              (getPageInARow_proc, thisPID))
        folderWorker.eraseRawData(searchword)
        folderWorker.mkdirForRawData(searchword)

        url = momoMallBrowser.keywordResourcePair._momoMallKeywordUrlPair[
            searchword]

        # Building the browser inside the while True loop avoids the "same browser" being rejected after visiting pages repeatedly.
        # Pre-set fallbacks (mirroring the defaults used below) so the error handling after the
        # loop does not hit a NameError when every attempt fails.
        soup = ""
        currentPage = 1
        totalPage = 3
        for i in range(4):
            try:
                timeSleepOne()
                timeSleepRandomly()

                browser = momoMallBrowser.intersectionForCrawl(
                    folderWorker.objective)

                timeSleepRandomly()

                browser.visit(url)

                browserWaitTime(browser)
                timeSleepTwo()

                # Click the "accuracy" (relevance) sort; the page count jumps back to page 1.
                try:
                    buyingTendency = momoMallBrowser.browserClickSearchType(
                        browser, 1)
                    browserWaitTime(browser)
                    timeSleepTwo()
                except AttributeError as e:
                    print(
                        f"{thisPID}__{getPageInARow_proc}  {searchword} 第1頁 點擊準確度有問題。",
                        e)
                    print(
                        f"{thisPID}__{getPageInARow_proc}  重建browser物件,進行再處理 {i} 次!"
                    )
                    browserQuit(browser, thisPID, getPageInARow_proc)
                    timeSleepFour()
                    soup = ""
                    continue

                tempHtml = browser.html

                timeSleepRandomly()
                soup = BeautifulSoup(tempHtml, 'lxml')
                print(
                    f"-----------------讀取{searchword}_{buyingTendency}第 1 頁-----------------成功!"
                )

                try:
                    ## current page and total page '頁數5/286'

                    pageState = browser.find_by_xpath(
                        '//*[@id="bt_2_layout_Content"]/div[2]/dl/dt/span')
                    totalPage = int(pageState.text.split('/')[1])
                    currentPage = int(
                        numsHandler.searchFloatNums(
                            pageState.text.split('/')[0]))
                    print(
                        f"-----------------讀取{searchword}_{buyingTendency} 總頁數-----------------成功!"
                    )
                except AttributeError as e:
                    print(f"getPageInARow __{searchword}__出錯", e, "重抓一次!")
                    # Forcing a stop here: in practice, "raise" only stops the currently running
                    # process, not all of the processes, so "raise" is not suitable here.
                    # raise
                    currentPage = 1  # custom fallback
                    totalPage = 3  # custom fallback
                    continue
                break
            except (ConnectionRefusedError, TimeoutException,
                    WebDriverException) as e:
                print(
                    f"{thisPID}__{getPageInARow_proc}  讀取{searchword}第 1 頁有問題。",
                    e)
                print(
                    f"{thisPID}__{getPageInARow_proc}  重建browser物件,進行再處理 {i} 次!"
                )
                browserQuit(browser, thisPID, getPageInARow_proc)
                timeSleepFour()
                timeSleepRandomly()
                soup = ""
            except StaleElementReferenceException as e:
                print(
                    "----------------StaleElementReferenceException----------------"
                )
                print(
                    f"{thisPID}__{getPageInARow_proc}  讀取{searchword}第 1 頁有問題。",
                    e)
                print(
                    f"{thisPID}__{getPageInARow_proc}  重建browser物件,進行再處理 {i} 次!"
                )
                browserQuit(browser, thisPID, getPageInARow_proc)
                timeSleepFour()
                timeSleepRandomly()
                soup = ""

        if not soup:
            errorMessage = f"{url}__{currentPage}__" + "\n"
            folderWorker.writeOutFile(
                f"{folderWorker._BASE_PATH}/dataMunging/{folderWorker.objectiveFolder}/{folderWorker.objective}/badRequest",
                f"badRequest_{searchword}.txt",
                errorMessage,
                writeOutType="a")

        folderWorker.writeOutFile(
            f"{folderWorker._BASE_PATH}/dataMunging/{folderWorker.objectiveFolder}/{folderWorker.objective}/{searchword}",
            f"1_{totalPage}_{searchword}.txt", soup)

        print(f'成功寫出  {searchword}  第 {currentPage} 頁')

        print('------接下來要處理 ' + searchword + ' 的頁數---------', totalPage, '頁')

        browserQuit(browser, thisPID, getPageInARow_proc)

        # Rest a bit longer so that the first page of every searchword gets fetched.
        timeSleepEight()
        timeSleepEight()
        timeSleepEight()

        for num in range(2, totalPage + 1):
            strNum = str(num)
            consecutiveData = searchword + "+" + strNum + "+" + str(totalPage)
            output.put(consecutiveData)
            # print(f'這裡是getPageInARow,準備送給  getPageInARowAdvanced  處理:  {searchword} 的 第 {strNum} 頁,總共{totalPage}')
            # print()

        input.task_done()  # Notify the main process that this input item has been processed.
        end = timeCalculate()
        print(f'{thisPID}__getPageInARow 累計耗時:{end-begin} 秒')
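Example 6 reads the page indicator text (e.g. '頁數5/286', "pages 5/286") from the page and derives currentPage and totalPage from it through the project's numsHandler helper. The sketch below approximates that parse with only the standard library; it is an assumption about what numsHandler.searchFloatNums does, not the project's implementation.

import re

def parsePageState(pageStateText):
    # '頁數5/286' -> (5, 286): take the first number before and after the slash.
    current, total = pageStateText.split('/')
    currentPage = int(re.search(r'\d+', current).group())
    totalPage = int(re.search(r'\d+', total).group())
    return currentPage, totalPage

# Example: parsePageState('頁數5/286') returns (5, 286).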
Example #7
def getPageInARow(input, headers, objectiveFolder, objective, *args):
    begin = time.time()
    thisPID = os.getpid()
    while True:
        print(thisPID,"===========================================")
        searchwordAndKeyword = input.get()
        searchword, keyword = searchwordAndKeyword.split("+")

        print("getPageInARow is in new process %s, %s " % (getPageInARow_proc, thisPID))
        eraseRawData(objectiveFolder, objective, searchword, keyword=keyword)
        mkdirForRawData(objectiveFolder, objective, searchword, keyword=keyword)

        totalPage, totalRows = getPageFirst(searchword, keyword, headers)
        try:
            totalPagePlusOne = totalPage+1
        except TypeError as e:
            print("getPageFirst 出錯", e)
            raise


        print(f"關鍵字 {searchword} 從{keyword}上取得資料,一共有 {totalPage} 頁, {totalRows}筆。")

        for page in range(1, totalPagePlusOne):
            url = 'https://ecshweb.pchome.com.tw/search/v3.3/{0}/results?q={1}&page={2}&sort=sale/dc'.format(keyword, searchword, page)
            
            for i in range(4):
                try:
                    timeSleepRandomly()
                    res = requests.get(url, headers=headers)
                    res.encoding = 'utf-8'
                    timeSleepRandomly()
                    jsonPage = json.loads(res.text)
                    timeSleepEight()
                    timeSleepRandomly()
                    break
                except (JSONDecodeError, ConnectionRefusedError) as e:  # If visits are too frequent, the JSON PChome returns is malformed, so deserialization raises an exception.
                    print(f"getPageInARow這裡發生錯誤  {keyword}_{searchword}_{page} "+str(e)+"正在處理中。")
                    timeSleepEight()
                    timeSleepRandomly()
                    jsonPage = ""
                except requests.exceptions.ConnectionError as e:
                    print(f"getPageInARow這裡發生錯誤  {keyword}_{searchword}_{page} "+str(e)+"正在處理中。")
                    timeSleepEight()
                    timeSleepRandomly()
                    jsonPage = ""

            if not jsonPage:
                badRequestRoute = f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/badRequest"
                with open(f"{badRequestRoute}/badRequest_{searchword}.txt", "a",  newline='', encoding='utf-8')as f: # newline沒作用...
                    errorMessage = url + "\n"
                    f.write(errorMessage)   #writelines作用在errorMessage是list時
            with open(f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/{searchword}/{keyword}/{page}_{totalPage}_{totalRows}_{keyword+searchword}.json", 'w', encoding='utf-8')as f:
                json.dump(jsonPage, f, indent=2, ensure_ascii=False)
            print("成功寫出  {0}  第 {1} 頁,共 {2} 頁".format(keyword+searchword, page, totalPage))

        print(f"這裡是getPageInARow_{thisPID},準備完成{keyword}_{searchword}工作。 ")
        print()
        end = time.time()
        print('getPageInARow 累計耗時:{0} 秒'.format(end-begin))
        input.task_done()  # Notify the main process that this input item has been processed.
        timeSleepOne()  # Pause a few seconds to simulate real-world pacing.
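Examples 1 and 7 both page through PChome's search endpoint and read totalPage and totalRows from its JSON response. Below is a minimal standalone sketch of a single request against that endpoint; the User-Agent header is a placeholder, the keyword path segment is whatever the project substitutes there, and real crawling should pace requests as the examples above do.

import json

import requests

def fetchPchomeSearchPage(keyword, searchword, page=1):
    url = ("https://ecshweb.pchome.com.tw/search/v3.3/{0}/results"
           "?q={1}&page={2}&sort=sale/dc").format(keyword, searchword, page)
    headers = {"User-Agent": "Mozilla/5.0"}  # placeholder header
    res = requests.get(url, headers=headers)
    res.encoding = 'utf-8'
    jsonPage = json.loads(res.text)
    # The examples above rely on these two fields to plan the crawl.
    return jsonPage['totalPage'], jsonPage['totalRows']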