Example #1
    def intersectionForCrawl(self, ecommerceName, ecommerceLink=None):

        if ecommerceName == self._eCommerceMomoMall:
            browser = buildSplinterBrowserHeadless('chrome')

            return browser
        else:
            return None
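
Every example on this page calls the shared helper buildSplinterBrowserHeadless, whose body is not shown here. A minimal sketch of what it plausibly looks like, assuming splinter's Browser constructor and its headless keyword:

from splinter import Browser

def buildSplinterBrowserHeadless(browserName):
    # Both the 'chrome' and 'firefox' drivers accept the headless keyword.
    return Browser(browserName, headless=True)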
Example #2
def requestsHandlingWhenTimeoutOccur(url, browserName):
    # Back off, then rebuild the browser from scratch and revisit the page.
    timeSleepEight()
    browser = buildSplinterBrowserHeadless(browserName)
    timeSleepRandomly()
    browser.visit(url)
    return browser  # hand the rebuilt browser back to the caller
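
The timeSleep* helpers used above and throughout the remaining examples are also defined elsewhere in the project; they behave like thin wrappers around time.sleep. A sketch under that assumption (the randomized range is a guess):

import random
import time

def timeSleepEight():
    # Fixed eight-second back-off before rebuilding the browser.
    time.sleep(8)

def timeSleepRandomly():
    # Short randomized pause so request timing looks less mechanical.
    time.sleep(random.uniform(1, 3))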
Example #3
def getPageInARowAdvanced(input, objectiveFolder, objective):
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        consecutiveUrl = input.get()
        searchword, page, totalPage, url = consecutiveUrl.split('+')
        # print(url)
        print(
            f"{thisPID}__{getPageInARowAdvanced_proc} starting to process page {page} of {searchword}:"
        )

        # Creating the browser inside the while True loop avoids the "same browser"
        # being refused after it keeps visiting pages.
        for i in range(3):
            try:
                timeSleepFour()

                browser = buildSplinterBrowserHeadless('chrome')

                timeSleepRandomly()

                browser.visit(url)

                browserWaitTime(browser)
                timeSleepTwo()

                tempHtml = browser.html
                timeSleepRandomly()

                soup = BeautifulSoup(tempHtml, 'html.parser')
                print(f"讀取{searchword}第 {page} 頁,成功!")
                break
            except (ConnectionRefusedError, TimeoutException,
                    WebDriverException) as e:
                print(
                    f"{thisPID}__{getPageInARowAdvanced_proc} problem reading page {page} of {searchword}.",
                    e)
                print(
                    f"{thisPID}__{getPageInARowAdvanced_proc} rebuilding the browser object, retry attempt {i+1}!"
                )
                timeSleepFour()
                timeSleepRandomly()
                soup = ""

        if not soup:
            badRequestRoute = f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/badRequest"
            with open(f"{badRequestRoute}/badRequest_{searchword}.txt",
                      "a",
                      newline='',
                      encoding='utf-8') as f:  # newline='' has no effect here...
                errorMessage = url + "\n"
                f.write(errorMessage)  # writelines would be used if errorMessage were a list

        with open(
                f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/{searchword}/{page}_{totalPage}_{searchword}.txt",
                'w',
                encoding='utf-8') as f:
            f.write(str(soup))
        print()
        print(f'{thisPID}  successfully wrote out page {page} of {searchword}, {totalPage} pages in total.')

        try:
            browser.quit()
            print(
                f"Closed browser successfully {thisPID}__{getPageInARowAdvanced_proc}++++++++++++++++++++++++++++++"
            )
        except Exception:
            print(f"Giving up on the browser of {thisPID}__{getPageInARowAdvanced_proc}.")
            print(
                f"kill {thisPID}__{getPageInARowAdvanced_proc} >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"
            )
            os.kill(thisPID, signal.SIGKILL)
        input.task_done()  # tell the main process that this input item is done!
        end = timeCalculate()
        print(f'{thisPID}__getPageInARowAdvanced elapsed time so far: {end-begin} seconds')
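
The rebuild-and-retry pattern above (a fresh headless browser per attempt, a bounded number of attempts, and an empty soup on total failure) recurs in Example #4 and could be factored into one helper. A sketch reusing the helpers these examples assume; fetch_soup_with_retries is a hypothetical name:

def fetch_soup_with_retries(url, attempts=3):
    # Build a fresh headless browser on every attempt so that one
    # blacklisted browser instance never blocks the whole job.
    for i in range(attempts):
        browser = None
        try:
            timeSleepFour()
            browser = buildSplinterBrowserHeadless('chrome')
            timeSleepRandomly()
            browser.visit(url)
            browserWaitTime(browser)
            timeSleepTwo()
            soup = BeautifulSoup(browser.html, 'html.parser')
            return browser, soup
        except (ConnectionRefusedError, TimeoutException, WebDriverException) as e:
            print(f"attempt {i + 1} on {url} failed:", e)
            if browser is not None:
                browser.quit()
            timeSleepFour()
    return None, ""  # the caller logs the URL as a bad request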
Example #4
def getPageInARow(input, output, keywordUrlPair, objectiveFolder, objective):
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        searchword = input.get()
        print('getPageInARow is in new process %s, %s ' %
              (getPageInARow_proc, thisPID))
        print()
        eraseRawData(objectiveFolder, objective, searchword)
        mkdirForRawData(objectiveFolder, objective, searchword)

        url = keywordUrlPair[searchword]

        # Creating the browser inside the while True loop avoids the "same browser" being refused after it keeps visiting pages.
        for i in range(3):
            try:
                timeSleepOne()
                timeSleepRandomly()

                browser = buildSplinterBrowserHeadless('chrome')

                timeSleepRandomly()

                browser.visit(url)

                browserWaitTime(browser)
                timeSleepTwo()

                tempHtml = browser.html

                timeSleepRandomly()
                soup = BeautifulSoup(tempHtml, 'html.parser')
                print(
                    f"Read page 1 of {searchword}>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>successfully!"
                )
                break
            except (ConnectionRefusedError, TimeoutException,
                    WebDriverException) as e:
                print(
                    f"{thisPID}__{getPageInARow_proc}  problem reading page 1 of {searchword}.",
                    e)
                print(
                    f"{thisPID}__{getPageInARow_proc}  rebuilding the browser object, retry attempt {i+1}!"
                )
                timeSleepFour()
                timeSleepRandomly()
                soup = ""

        try:
            totalPage = interDiv(searchNums(soup.select_one('.totalTxt').text),
                                 30)
        except AttributeError as e:
            print("getPageInARow error", e)
            # Force the program to stop here.
            raise

        print('------ pages to process next for ' + searchword + ' ---------', totalPage, 'pages')
        print()

        with open(
                f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/{searchword}/1_{totalPage}_{searchword}.txt",
                'w',
                encoding='utf-8') as f:
            f.write(str(soup))
        print()
        print(f'Successfully wrote out page 1 of {searchword}')

        i_browser = 1
        try:
            browser.quit()
            print(
                f"Closed browser successfully {getPageInARow_proc}++++++++++++++++++++++++++++++"
            )
        except Exception:
            print(
                f"Giving up on browser #{i_browser} of {thisPID}__{getPageInARow_proc}.")
            i_browser += 1
            print(
                f"kill {thisPID}__{getPageInARow_proc} >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"
            )
            os.kill(thisPID, signal.SIGKILL)

        # Rest a bit longer so that page 1 of every searchword gets read.
        timeSleepEight()
        timeSleepEight()

        for num in range(2, totalPage + 1):
            strNum = str(num)
            consecutiveData = searchword + "+" + strNum + "+" + str(
                totalPage) + "+" + re.sub(r"curPage=1", f"curPage={strNum}",
                                          url)
            output.put(consecutiveData)
            # print(f'getPageInARow here, handing page {strNum} of {searchword} (of {totalPage} total) to getPageInARowAdvanced')
            print()
        input.task_done()  # tell the main process that this input item is done!
        end = timeCalculate()
        print(f'{thisPID}__getPageInARow elapsed time so far: {end-begin} seconds')
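
Examples #3 and #4 form a producer/consumer pair: getPageInARow fetches page 1 per keyword, derives totalPage, and puts searchword+page+totalPage+url strings on a queue that getPageInARowAdvanced drains. A minimal sketch of how a main process might wire them up with multiprocessing.JoinableQueue; the folder names, sample keyword, and worker count are assumptions:

import multiprocessing

if __name__ == '__main__':
    keywordUrlPair = {'notebook': 'https://example.com/search?q=notebook&curPage=1'}  # hypothetical
    firstPages = multiprocessing.JoinableQueue()
    laterPages = multiprocessing.JoinableQueue()

    producer = multiprocessing.Process(
        target=getPageInARow,
        args=(firstPages, laterPages, keywordUrlPair, 'momo', 'rawData'),  # hypothetical folders
        daemon=True)
    producer.start()

    consumers = [
        multiprocessing.Process(target=getPageInARowAdvanced,
                                args=(laterPages, 'momo', 'rawData'),
                                daemon=True)
        for _ in range(4)
    ]
    for proc in consumers:
        proc.start()

    for searchword in keywordUrlPair:
        firstPages.put(searchword)
    firstPages.join()   # blocks until task_done() has fired for every page 1
    laterPages.join()   # blocks until every later page is written out

Because the workers loop forever, they are daemonized and completion is signaled through JoinableQueue.join() plus the task_done() calls already present in the examples.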
Example #5
def getPageInARow(input, url, firstPage, topTabList, elementUrl,
                  objectiveFolder, objective, *args):
    begin = timeCalculate()
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        searchword = input.get()

        mkdirForRawData(objectiveFolder,
                        objective,
                        "google",
                        keyword=searchword)
        browser = buildSplinterBrowserHeadless("chrome")

        browser.visit(url)
        browserWaitTime(browser)

        searchwordKeyInAndEnter(browser, searchword)
        browser.driver.set_window_size(1024, 768)

        forSureNws = findOutNws(browser, topTabList)
        # '新聞' ("News") is the label of Google's news tab; it is matched
        # against live page text, so the Chinese string must stay as-is.
        keyNews = [key for key in forSureNws if forSureNws[key] == '新聞'].pop()
        # Exclude the news tab from the human-like mouse_over simulation.
        topTabList.remove(int(keyNews))

        print(f"Clicking topTabList entry {keyNews} to go to the news page")
        # Click the news tab.
        browser.find_by_xpath(
            f'//*[@id="hdtb-msb-vis"]/div[{keyNews}]/a').click()
        timeSleepRandomly()

        newsDict = {}
        newsDictInner = {}
        while True:
            print(f"進行 {searchword} 第", firstPage, "頁")
            elementUrlExtract(browser, firstPage, topTabList, elementUrl,
                              newsDictInner, searchword)
            judgment = judgeNextPage(browser, searchword)
            if judgment:
                print(f"'{searchword}' still has a next page, keep crawling!")
                firstPage += 1
            else:
                browser.quit()
                break

        timeStamp = timeStampGenerator()
        newsTotalNum = len(newsDictInner)
        newsDict["dateTime"] = timeStamp
        newsDict["keyword"] = searchword
        newsDict["newsTotalNum"] = newsTotalNum
        newsDict["newsUrl"] = newsDictInner

        with open(
                f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/google/{searchword}/google_{timeStamp}_{newsTotalNum}_{searchword}.json",
                'w',
                encoding='utf-8') as f:
            json.dump(newsDict, f, indent=2, ensure_ascii=False)
        print(
            f'{thisPID}  successfully wrote out  google_{timeStamp}_{newsTotalNum}_{searchword}.json '
        )

        input.task_done()
        end = timeCalculate()
        print(f'{thisPID}__getPageInARow elapsed time so far: {end-begin} seconds')
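
timeStampGenerator and timeCalculate are two more project helpers these examples take for granted. Plausible sketches, assuming the timestamp is a formatted wall-clock reading and timeCalculate returns seconds:

import time
from datetime import datetime

def timeStampGenerator():
    # e.g. '20240131_173025'; the exact format is an assumption.
    return datetime.now().strftime('%Y%m%d_%H%M%S')

def timeCalculate():
    # Wall-clock seconds; the difference of two readings is the elapsed time.
    return int(time.time())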