Example #1
def getPageInARaw(input, _headers, objectiveFolder, objective, *args):
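    # Worker process: repeatedly takes "year+month" strings from the queue, downloads the CWB monthly-statistics page, and writes the raw HTML out as a text file.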
    begin = timeCalculate()
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        consecutiveUrl = input.get()
        year, month = consecutiveUrl.split("+")

        url = f"https://www.cwb.gov.tw/V8/C/C/Statistics/MonthlyData/MOD/{year}_{month}.html"
        res = requests.get(url, headers=_headers)
        timeSleepRandomly()
        res.encoding = "utf-8"

        soup = BeautifulSoup(res.text, 'html.parser')

        with open(
                f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/{year}/{month}_{year}.txt",
                'w',
                encoding='utf-8') as f:
            f.write(str(soup))
        print()
        print(f'{thisPID}  成功寫出  {month}_{year}.txt ')

        input.task_done()
        end = timeCalculate()
        print(f'{thisPID}_getPageInARaw 累計耗時:{end-begin} 秒')
        timeSleepOne()
Example #2
    def requests(cls, url, headers):
        for i in range(3):
            try:
                timeSleepOne()
                res = requests.get(url, headers=headers)
                res.encoding = 'utf-8'
                timeSleepRandomly()

                soup = BeautifulSoup(res.text, 'html.parser')
                newsContent = [
                    textMiningRegex.discardSpace(
                        textMiningRegex.replaceEscapeAlphabet(row.text))
                    for row in soup.find("article").stripped_strings
                    if row != "" and not "googletag.cmd.push" in row
                    and not "function" in row
                ]
                videoLinkInContent = None  # the article body itself has no video
                break
            except requests.exceptions.ConnectionError as e:
                print(url, "發生問題。", e)
                print()
                timeSleepRandomly()
                timeSleepTwo()
                newsContent = None
                videoLinkInContent = None

        return videoLinkInContent, newsContent
Example #3
    def requests(cls, url, headers):
        for i in range(3):
            try:
                timeSleepOne()
                res = requests.get(url, headers=headers)
                res.encoding = 'utf-8'
                timeSleepRandomly()

                soup = BeautifulSoup(res.text, 'html.parser')
                newsContent = [
                    row for row in soup.select_one(".story").stripped_strings
                ]

                # video embedded in the article body
                if soup.p.iframe:  #.attrs.get("src"):
                    videoLinkInContent = soup.p.iframe.attrs.get("src")
                    print("ETtoday 發現內文有影片:", videoLinkInContent)

                else:
                    videoLinkInContent = None

                break
            except requests.exceptions.ConnectionError as e:
                print(url, "發生問題。", e)
                print()
                timeSleepRandomly()
                timeSleepTwo()
                newsContent = None
                videoLinkInContent = None

        return videoLinkInContent, newsContent
Example #4
def searchwordKeyInAndEnter(browser, searchword):
    # type the search word into the input box
    browser.find_by_xpath('//*[@id="tsf"]/div[2]/div/div[1]/div/div[1]/input').fill(searchword)
    timeSleepOne()
    # enter
    browser.find_by_xpath('//*[@id="tsf"]/div[2]/div/div[2]/div[2]/div/center/input[1]').click()
    timeSleepRandomly()
Example #5
def elementUrlExtract(browser, topTabList, elementUrl, newsDictInner):
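    # Walks the Google results list by XPath and records each item's url, title, publisher, and timestamped date into newsDictInner.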
    try:
        for order in elementUrl:
            broUrl = browser.find_by_xpath(f'//*[@id="rso"]/div/div[{order}]/div/div/h3/a')
            broPublisher = browser.find_by_xpath(f'//*[@id="rso"]/div/div[{order}]/div/div/div[1]/span[1]')
            broDate = browser.find_by_xpath(f'//*[@id="rso"]/div/div[{order}]/div/div/div[1]/span[3]')
            newsUrl = broUrl["href"]
            newsTitle = broUrl.text
            publisher = broPublisher.text
            date = broDate.text

            print(newsUrl)
            print(newsTitle)
            print(publisher)
            print(date)
            
            timeSleepRandomly()
            newsDictInner[newsUrl] = [newsTitle, discardSpace(publisher), timeStampCalculate(date)]

            humanSimulate(browser, topTabList)
            
    except ElementDoesNotExist as e:  # fewer than 10 news items on this page.
        print("新聞標的不到10項,準備關閉瀏覽器。", e)
        print("成功擷取當前頁的新聞連結。")
        pass
    else:
        print("成功擷取當前頁的新聞連結。")
Example #6
    def requests(cls, url, headers):
        for i in range(3):
            try:
                timeSleepOne()
                res = requests.get(url, headers=headers)
                res.encoding = 'utf-8'
                timeSleepRandomly()

                soup = BeautifulSoup(res.text, 'html.parser')
                newsContent = [
                    textMiningRegex.discardSpace(
                        textMiningRegex.replaceEscapeAlphabet(row.text))
                    for row in soup.select_one(".article-body").select("p")
                    if row.text != ""
                ]
                videoLinkInContent = None  # the article body itself has no video

                break
            except requests.exceptions.ConnectionError as e:
                print(url, "發生問題。", e)
                print()
                timeSleepRandomly()
                timeSleepTwo()
                newsContent = None
                videoLinkInContent = None

        return videoLinkInContent, newsContent
Example #7
def getPageInARow(input, headers, objectiveFolder, objective, *args):
    begin = timeCalculate()  # start time for the cumulative elapsed-time log below
    thisPID = os.getpid()
    while True:
        # print(thisPID,"===========================================")
        consecutiveUrl = input.get()
        searchword, correctUrl, txtFileRoute = consecutiveUrl.split("+")

        fileName = txtFileRoute.split("/")[-1]
        page = fileName.split("_")[0]
        totalPage = fileName.split("_")[1]

        # print('getPageInARow is in new process %s, %s ' % (getPageInARow_proc, os.getpid()))
        # print('------接下來要處理 ' + searchword + '第' ,page, '頁---------共', totalPage, '頁')

        timeSleepOne()
        timeSleepRandomly()
        res = requests.get(correctUrl, headers=headers)
        res.encoding = 'utf-8'

        timeSleepRandomly()

        soup = BeautifulSoup(res.text, 'html.parser')

        with open(txtFileRoute, 'w', encoding='utf-8') as f:
            f.write(str(soup))
        print(f"成功寫出  {searchword}  第 {page} 頁, 共 {totalPage} 頁。")
        end = timeCalculate()
        print('getPageInARow 累計耗時:{0} 秒'.format(end - begin))
        input.task_done()  # notify the main process that this input item has been handled!
        timeSleepOne()  # pause for a few seconds to mimic real-world behaviour.
Example #8
def searchwordKeyInAndEnter(browser, searchword):
    # type the search word into the input box
    # //*[@id="tsf"]/div[2]/div/div[1]/div/div[1]/input
    # //*[@id="tsf"]/div[2]/div[1]/div[1]/div/div[2]/input  # this one works

    try:
        browser.find_by_xpath(
            '//*[@id="tsf"]/div[2]/div[1]/div[1]/div/div[2]/input').fill(
                searchword)
        timeSleepOne()
    except AttributeError as e:
        browser.find_by_xpath(
            '//*[@id="tsf"]/div[2]/div/div[1]/div/div[1]/input').fill(
                searchword)
        timeSleepOne()
    # enter
    """
    Message: unknown error: Element <input class="gNO89b" value="Google 搜尋" aria-label="Google 搜尋" name="btnK" type="submit" data-ved="0ahUKEwj79oGLmIPpAhULGqYKHRcCBy8Q4dUDCAk"> is not clickable at point (445, 567). Other element would receive the click: <div class="fbar">...</div>
    (Session info: headless chrome=80.0.3987.122)
    (Driver info: chromedriver=2.36.540471 (9c759b81a907e70363c6312294d30b6ccccc2752),platform=Linux 4.15.0-65-generic x86_64)
    """
    browser.driver.set_window_size(1920, 1080)
    browser.find_by_xpath(
        '//*[@id="tsf"]/div[2]/div/div[2]/div[2]/div/center/input[1]').click()
    # browser.find_by_value("Google 搜尋").click()

    timeSleepRandomly()
Example #9
def getPageFirst(searchword, keyword, headers):
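    # Queries the PChome search API (page 1, sorted by sales) with up to three retries; "99999" is the sentinel value returned when every attempt fails.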
    url = "https://ecshweb.pchome.com.tw/search/v3.3/{0}/results?q={1}&page=1&sort=sale/dc".format(keyword, searchword)
    for i in range(3):
        try:
            timeSleepRandomly()
            res = requests.get(url, headers=headers)
            timeSleepRandomly()
            res.encoding = 'utf-8'
            jsonPage = json.loads(res.text)
            totalPage = jsonPage['totalPage']
            totalRows = jsonPage['totalRows']
            timeSleepEight()
            timeSleepRandomly()
            break
        except (JSONDecodeError, ConnectionRefusedError) as e:  # if visits are too frequent, the JSON PChome returns is malformed, so deserialization raises an exception.
            print(f"getPageFirst {keyword}  {searchword} 這裡發生錯誤   "+str(e)+"正在處理中。")
            timeSleepEight()
            timeSleepRandomly()
            totalPage = "99999"
            totalRows = "99999"
        except requests.exceptions.ConnectionError as e:
            print(f"getPageFirst {keyword}  {searchword} 這裡發生錯誤   "+str(e)+"正在處理中。")
            timeSleepEight()
            timeSleepRandomly()
            totalPage = "99999"
            totalRows = "99999"
    return totalPage, totalRows
Example #10
def getPageFirst(url, headers):

    res = requests.get(url, headers=headers)
    res.encoding = "utf-8"

    timeSleepRandomly()

    soup = BeautifulSoup(res.text, 'html.parser')
    totalPage = soup.select('.Paging span')[-1].text

    return totalPage
Example #11
    def requests(cls, url, headers):
        for i in range(3):
            try:
                timeSleepOne()
                res = requests.get(url, headers=headers)
                res.encoding = 'utf-8'
                timeSleepRandomly()

                soup = BeautifulSoup(res.text, 'html.parser')
                newsContent = ([
                    row for row in soup.select_one(".newsdetail_content").find(
                        "div", {
                            "class": "contxt margin_b20"
                        }).find("div", {
                            "id": "news_detail_div"
                        }).stripped_strings
                ])
                # video embedded in the article body
                if soup.select_one(".newsdetail_content").find(
                        "div", {
                            "class": "contxt margin_b20"
                        }).find("iframe", {"class": "video"}):
                    linkInContent = soup.select_one(
                        ".newsdetail_content").find(
                            "div", {
                                "class": "contxt margin_b20"
                            }).find("iframe", {
                                "class": "video"
                            }).attrs.get("src")
                    videoID = urlParseDealing.urlParsePath(
                        linkInContent
                    ).split("/")[
                        -1]  #videoID = link.split("/embed/")[1].split("?")[0]
                    videoLinkInContent = f"https://youtube.com/watch?v={videoID}"
                    print("TVBS 發現內文有影片:", videoLinkInContent)

                else:
                    videoLinkInContent = None

                break
            except requests.exceptions.ConnectionError as e:
                print(url, "發生問題。", e)
                print()
                timeSleepRandomly()
                timeSleepTwo()
                newsContent = None
                videoLinkInContent = None

        return videoLinkInContent, newsContent
Example #12
def browserWaitTime(browser):
    browser.wait_time
    timeSleepRandomly()


# scheme of making object
# class bbb(object):
#     def __init__(self,num):
#         self.num = num
#     def buildSplinterBrowser(self, browserName):
#         browser = Browser(driver_name = browserName, headless=False, incognito=True)#, options = chrome_options)
#         return browser

# browser = BBB.buildSplinterBrowser("chrome")

# browser.visit("https://www.google.com/")
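
A minimal sketch of the commented-out "scheme of making object" above, assuming the splinter package and a Chrome driver are installed; the class name BrowserFactory and the usage lines are illustrative, not part of the original code.

from splinter import Browser


class BrowserFactory(object):
    def __init__(self, headless=True):
        self.headless = headless

    def buildSplinterBrowser(self, browserName):
        # splinter forwards headless/incognito to the underlying Chrome driver.
        return Browser(driver_name=browserName,
                       headless=self.headless,
                       incognito=True)


# Hypothetical usage:
# browser = BrowserFactory(headless=False).buildSplinterBrowser("chrome")
# browser.visit("https://www.google.com/")
# browser.quit()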
Example #13
def humanSimulate(browser, topTabList):
    randomNum = random.choice(topTabList)
    print(randomNum,"================")
    try:
        browser.find_by_xpath(f'//*[@id="hdtb-msb-vis"]/div[{randomNum}]/a').mouse_over()
        timeSleepRandomly()
        browser.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        timeSleepOne()
        browser.execute_script('window.scrollTo(0,0);')

        if browser.is_element_present_by_xpath('//*[@id="logo"]/img'):
            browser.find_by_xpath('//*[@id="logo"]/img').mouse_over()
        elif browser.is_element_present_by_xpath('//*[@id="logocont"]/a/img'):
            browser.find_by_xpath('//*[@id="logocont"]/a/img').mouse_over()
    except AttributeError as e:  # no element found to mouse_over(); the Google logo at the top left has two possible locations: //*[@id="logo"]/img and //*[@id="logocont"]/a/img
        print("擬人化操作找不到 Element。", e)
        pass
Example #14
def getPageInARow(url, searchword, firstPage, topTabList, elementUrl):

    browser = buildSplinterBrowser("chrome")
     
    browser.visit(url)
    browserWaitTime(browser)

    searchwordKeyInAndEnter(browser, searchword)
    browser.driver.set_window_size(1024,768)

    forSureNws = findOutNws(browser, topTabList)

    keyNews = [key for key in forSureNws if forSureNws[key] == '新聞'].pop()
    # exclude the News tab from the human-simulation mouse_over
    topTabList.remove(int(keyNews))

    print(f"點擊 {keyNews} 去到 新聞頁")
    # click the News tab
    browser.find_by_xpath(f'//*[@id="hdtb-msb-vis"]/div[{keyNews}]/a').click()
    timeSleepRandomly()

    newsDict = {}
    newsDictInner = {}
    while True:
        print(f"進行 {searchword} 第", firstPage, "頁")
        elementUrlExtract(browser, topTabList, elementUrl, newsDictInner)
        judgment = judgeNextPage(browser)
        if judgment:
            print("仍有下一頁,繼續爬取!")
            firstPage += 1
            pass
        else:
            browser.quit()
            break

    
    newsDict["dateTime"] = timeStampGenerator()
    newsDict["keyword"] = searchword
    newsDict["newsTotalNum"] = len(newsDictInner)
    newsDict["newsUrl"] = newsDictInner

    return newsDict
Example #15
    def requests(cls, url, headers):
        for i in range(3):
            try:
                timeSleepOne()
                res = requests.get(url, headers=headers)
                res.encoding = 'utf-8'
                timeSleepRandomly()

                soup = BeautifulSoup(res.text, 'html.parser')
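                # Note: this variant only fetches and parses the page; nothing is extracted, so the method returns no content.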

                pass

                break
            except requests.exceptions.ConnectionError as e:
                print(url, "發生問題。", e)
                print()
                timeSleepRandomly()
                timeSleepTwo()

        return
Example #16
    def requests(cls, url, headers):
        for i in range(3):
            try:
                timeSleepOne()

                if "https://ent.ltn.com.tw/news/" in url:
                    videoLinkInContent, newsContent = ltnRequests.requestsUrlWithENT(
                        url, headers)
                    break

                res = requests.get(url, headers=headers)
                res.encoding = 'utf-8'
                timeSleepRandomly()

                soup = BeautifulSoup(
                    res.text, 'lxml'
                )  # html.parser is not robust enough here; it fails to pick up the content of https://ec.ltn.com.tw/article/paper/1295417
                try:
                    newsContent = [
                        textMiningRegex.discardSpace(
                            textMiningRegex.replaceEscapeAlphabet(row.text))
                        for row in soup.select_one(".text").select("p")
                        if row.text != ""
                    ]
                    videoLinkInContent = None  # the article body itself has no video
                except AttributeError as e:
                    # https://news.ltn.com.tw/news/consumer/paper/1284005  --> https://ent.ltn.com.tw/news/paper/1284005
                    print("error code:", e, url)
                    videoLinkInContent, newsContent = ltnRequests.requestsUrlWithENT(
                        url, headers)
                break

            except requests.exceptions.ConnectionError as e:
                print(url, "發生問題。", e)
                print()
                timeSleepRandomly()
                timeSleepTwo()
                newsContent = None
                videoLinkInContent = None

        return videoLinkInContent, newsContent
Example #17
def humanSimulate(browser, topTabList):
    # Noticed on 2020/03/19: after clicking "Images", Chrome's locale switches from Chinese to English, which changes the XPaths.
    # //*[@id="yDmH0d"]/div[2]/c-wiz/div[1]/div/div[1]/div[1]/div/div/a[2]
    # //*[@id="yDmH0d"]/div[2]/c-wiz/div[1]/div/div[1]/div[1]/div/div/a[3]
    # //*[@id="yDmH0d"]/div[2]/c-wiz/div[1]/div/div[1]/div[1]/div/div/a[4]
    # AttributeError: 'ElementList' object has no attribute 'click'
    randomNum = random.choice(topTabList)
    print("對 topTabList 第", randomNum, "項,做擬人================")
    try:
        browser.find_by_xpath(
            f'//*[@id="hdtb-msb-vis"]/div[{randomNum}]/a').mouse_over()
        timeSleepRandomly()
        browser.execute_script(
            'window.scrollTo(0, document.body.scrollHeight);')
        timeSleepOne()
        browser.execute_script('window.scrollTo(0,0);')

        if browser.is_element_present_by_xpath('//*[@id="logo"]/img'):
            browser.find_by_xpath('//*[@id="logo"]/img').mouse_over()
        elif browser.is_element_present_by_xpath('//*[@id="logocont"]/a/img'):
            browser.find_by_xpath('//*[@id="logocont"]/a/img').mouse_over()
    except AttributeError as e:  # no element found to mouse_over(); the Google logo at the top left has two possible locations: //*[@id="logo"]/img and //*[@id="logocont"]/a/img
        print("擬人化操作找不到 Element。", e)
        pass
Example #18
    def requests(cls, url, headers):
        for i in range(3):
            try:
                timeSleepOne()
                res = requests.get(url, headers=headers)
                res.encoding = 'utf-8'
                timeSleepRandomly()

                soup = BeautifulSoup(res.text, 'html.parser')
                try:
                    newsContent = soup.find("article", {
                        "itemprop": "articleBody"
                    }).text.strip().split(" ")
                except AttributeError as e:
                    # url = "https://tw.news.yahoo.com/video/%E7%AF%80%E8%83%BD%E5%AE%B6%E9%9B%BB%E8%A3%9C%E5%8A%A9%E5%86%8D%E5%8A%A0%E7%A2%BC-%E8%B2%A8%E7%89%A9%E7%A8%85%E6%B8%9B%E5%85%8D%E9%96%8B%E8%B7%91-053307068.html"
                    # print("error code:", e, url)
                    try:
                        newsContent = soup.find("article").text.strip().split(
                            " ")
                    except AttributeError as e:
                        # "https://tw.news.yahoo.com/%E9%BB%83%E9%87%91%E9%80%B1%E5%A4%A7%E5%90%8C3c%E9%85%AC%E8%B3%93%E7%9B%9B%E5%85%B8-%E6%B6%BC%E5%A4%8F%E6%9C%80%E5%BC%B7%E6%AA%94-081101070.html": [
                        # "黃金週大同3C酬賓盛典涼夏最強檔",
                        print("error code:", "這則新聞爆炸了!", url)
                        newsContent = None

                videoLinkInContent = None  # the article body itself has no video
                break
            except requests.exceptions.ConnectionError as e:
                print(url, "發生問題。", e)
                print()
                timeSleepRandomly()
                timeSleepTwo()
                newsContent = None
                videoLinkInContent = None

        return videoLinkInContent, newsContent
Example #19
def getPageInARow(input, output, keywordUrlPair, objectiveFolder, objective):
    begin = timeCalculate()  # start time for the cumulative elapsed-time log below
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        searchword = input.get()
        print('getPageInARow is in new process %s, %s ' %
              (getPageInARow_proc, thisPID))
        print()
        eraseRawData(objectiveFolder, objective, searchword)
        mkdirForRawData(objectiveFolder, objective, searchword)

        url = keywordUrlPair[searchword]

        # Creating the browser inside the while True loop avoids being rejected for visiting pages repeatedly with the same browser.
        for i in range(3):
            try:
                timeSleepOne()
                timeSleepRandomly()

                browser = buildSplinterBrowserHeadless('chrome')

                timeSleepRandomly()

                browser.visit(url)

                browserWaitTime(browser)
                timeSleepTwo()

                tempHtml = browser.html

                timeSleepRandomly()
                soup = BeautifulSoup(tempHtml, 'html.parser')
                print(
                    f"讀取{searchword}第 1 頁>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>成功!"
                )
                break
            except (ConnectionRefusedError, TimeoutException,
                    WebDriverException) as e:
                print(
                    f"{thisPID}__{getPageInARow_proc}  讀取{searchword}第 1 頁有問題。",
                    e)
                print(
                    f"{thisPID}__{getPageInARow_proc}  重建browser物件,進行再處理 {i} 次!"
                )
                timeSleepFour()
                timeSleepRandomly()
                soup = ""
            # else:
            #     print(f"讀取{searchword}第 1 頁>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>成功!")

        try:
            totalPage = interDiv(searchNums(soup.select_one('.totalTxt').text),
                                 30)
        except AttributeError as e:
            print("getPageInARow 出錯", e)
            # force the program to stop
            raise

        print('------接下來要處理 ' + searchword + ' 的頁數---------', totalPage, '頁')
        print()

        with open(
                f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/{searchword}/1_{totalPage}_{searchword}.txt",
                'w',
                encoding='utf-8') as f:
            f.write(str(soup))
        print()
        print(f'成功寫出  {searchword}  第 1 頁')

        i_browser = 1
        try:
            browser.quit()
            print(
                f"成功關閉 browser{getPageInARow_proc}++++++++++++++++++++++++++++++"
            )
        except:
            print(
                f"放棄 {thisPID}__{getPageInARow_proc} 的 第{i_browser}個browser。")
            i_browser += 1
            print(
                f"kill {thisPID}__{getPageInARow_proc} >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"
            )
            os.kill(thisPID, signal.SIGKILL)

        # Rest a while longer so that the first page of every searchword gets read.
        timeSleepEight()
        timeSleepEight()

        for num in range(2, totalPage + 1):
            strNum = str(num)
            consecutiveData = searchword + "+" + strNum + "+" + str(
                totalPage) + "+" + re.sub(r"curPage=1", f"curPage={strNum}",
                                          url)
            output.put(consecutiveData)
            # print(f'這裡是getPageInARow,準備送給  getPageInARowAdvanced  處理:  {searchword} 的 第 {strNum} 頁,總共{totalPage}')
            print()
        input.task_done()  # notify the main process that this input item has been handled!
        end = timeCalculate()
        print(f'{thisPID}__getPageInARow 累計耗時:{end-begin} 秒')
Example #20
    def requests(cls, url, headers):
        for i in range(3):
            try:
                timeSleepOne()
                res = requests.get(url, headers=headers)
                res.encoding = 'utf-8'
                timeSleepRandomly()

                soup = BeautifulSoup(res.text, 'html.parser')
                try:
                    newsContent = [
                        textMiningRegex.discardSpace(
                            textMiningRegex.replaceEscapeAlphabet(row.text))
                        for row in soup.select_one(".ndArticle_margin").select(
                            "p") if row.text != ""
                    ]
                    videoLinkInContent = None  # the article body itself has no video
                except AttributeError as e:  # AttributeError: 'NoneType' object has no attribute 'select'
                    soupStr = str(soup)
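                    # Fallback: recover the article text from the JSON embedded in the page's window.Fusion script by string splitting.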
                    if "<br> \xa0</p>" in soupStr:
                        # "<br> \xa0</p>" does not need to become "<br> \\xa0</p>"
                        """
                        sqlalchemy.exc.OperationalError: (pymssql.OperationalError) (8152, b'String or binary data would be truncated.DB-Lib error message 8152, severity 16:\nGeneral SQL Server error: Check messages from the SQL Server\n')
                        [SQL: INSERT INTO selected_news_with_tfidf ([news_title_Id], [series_Id], [publisher_Id], news_content, video_link_in_content) VALUES (%(news_title_Id)s, %(series_Id)s, %(publisher_Id)s, %(news_content)s, %(video_link_in_content)s)]
                        [parameters: {'news_title_Id': '201912252', 'series_Id': UUID('9abd7eae-c361-496c-b10c-ae9fcf7be8bb'), 'publisher_Id': '5', 'news_content': '[\'<p> 今年農曆年節時間較早,家電採購需求較以往提早出現買氣,瞄準年前有汰換家中家電的需求,大同3C福利品特賣會特於12月底開跑,一路至明年1月初,提供消費者年前採購好選擇。<br> <br> 12月26日起至2020年1月8日止,全台各地共舉辦20場大同3C福利品特賣會,大小家電可在此一次 ... 
                        (3925 characters truncated) ... aws.com/ap-ne-1-prod/public/FLCZDN5FBRQBN6E6E3S7RP7IW4.jpg","version":"0.10.3","width":640},{"_id":"IO25XHAIRJE3FCUWV7YTXI66CY","type":"raw_html",\']', 'video_link_in_content': None}]
                        (Background on this error at: http://sqlalche.me/e/e3q8)
                        """

                        # https://tw.appledaily.com/property/20191226/WCUY7RP45D2V45RLRN3RULU2QU/
                        tmpStr = soupStr.split(
                            """<script type="application/javascript">window.Fusion="""
                        )[1].split("Fusion.globalContent=")[1].split(
                            '"content":"')[1].split("<br> \xa0</p>")[0]
                        newsContent = [
                            row for row in BeautifulSoup(
                                tmpStr, "html.parser").text.split(" ")
                            if row != ""
                        ]
                    else:
                        # https://tw.appledaily.com/gadget/20190927/IFU7ML7HXNAL2GHDNKOZULDNOU/
                        tmpStr = soupStr.split(
                            """<script type="application/javascript">window.Fusion="""
                        )[1].split("Fusion.globalContent=")[1].split(
                            '"content":"')[1].split("更多「")[0]
                        newsContent = [
                            row for row in tmpStr.split("<br />&nbsp;<br />")
                            if row != ""
                        ]

                        if len("".join(newsContent)) >= 3500:
                            # elif '<br />&nbsp;"' in soupStr:
                            # https://tw.appledaily.com/gadget/20191029/KSU3NPGRYURXTCI3COIUE6KMNM/
                            print(
                                f"appledaily news content exceeds 3500: {url}")
                            tmpStr = soupStr.split(
                                """<script type="application/javascript">window.Fusion="""
                            )[1].split("Fusion.globalContent=")[1].split(
                                '"content":"')[1].split('<br />&nbsp;"}')[0]
                            newsContent = [
                                row
                                for row in tmpStr.split("<br />&nbsp;<br />")
                                if row != ""
                            ]

                    videoLinkInContent = None  # the article body itself has no video

                break
            except requests.exceptions.ConnectionError as e:
                print(url, "發生問題。", e)
                print()
                timeSleepRandomly()
                timeSleepTwo()
                newsContent = None
                videoLinkInContent = None

        return videoLinkInContent, newsContent
Example #21
def browserWaitTime(browser):
    browser.wait_time
    timeSleepRandomly()
Example #22
def browserSetWindowSize(browser, horizon=1024, vertical=768):

    browser.driver.set_window_size(horizon, vertical)
    timeSleepRandomly()
Example #23
    def requests(cls, url, headers):
        for i in range(3):
            try:
                timeSleepOne()
                res = requests.get(url, headers=headers)
                res.encoding = 'utf-8'
                timeSleepRandomly()

                soup = BeautifulSoup(res.text, 'html.parser')
                newsContent = [
                    textMiningRegex.discardSpace(
                        textMiningRegex.replaceEscapeAlphabet(row.text)) for
                    row in soup.select_one("#story_body_content").select("p")
                    if row.text != ""
                ]
                videoLinkInContent = None  # the article body itself has no video
                break
            except AttributeError as e:

                try:
                    # 20200207: udn redesigned its pages
                    newsContent = [
                        textMiningRegex.discardSpace(
                            textMiningRegex.replaceEscapeAlphabet(row.text))
                        for row in soup.find("article", {
                            "class": "article-content"
                        }).find_all("p") if row.text != ""
                    ]
                except AttributeError as e:
                    # If the visit hits a 404, the HTML looks like the following.
                    '''
                    response404 = """<html>
                                <head>
                                <script>
                                                        var d = new Date();
                                                        d.setTime(d.getTime() + (300*1000));
                                                        var expires = "expires="+ d.toUTCString();
                                                        document.cookie = "burl=my-test-page01;" + expires + ";path=/";
                                                </script>
                                <!-- Google Tag Manager -->
                                <script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
                                                new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
                                                j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
                                                'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);
                                                })(window,document,'script','dataLayer','GTM-5CMHR66');</script>
                                <!-- End Google Tag Manager --><script>
                                                (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
                                                (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
                                                m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
                                                })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
                                                        </script>
                                <!-- #Location: /inc/meta/trace_ga -->
                                </head>
                                <body>
                                <!-- Google Tag Manager (noscript) -->
                                <noscript><iframe height="0" src="https://www.googletagmanager.com/ns.html?id=GTM-5CMHR66" style="display:none;visibility:hidden" width="0"></iframe></noscript>
                                <!-- End Google Tag Manager (noscript) -->
                                <script>
                                                window.location="/news/e404?nver";
                                        </script>
                                </body>
                                </html>"""
                    
                    '''

                    if searchWordTrueOrFalse(
                            "404",
                            str(soup.select_one("body").select_one("script"))
                    ):  #'<script>\n                window.location="/news/e404?nver";\n        </script>'
                        # https://udn.com/news/story/7238/3600804
                        print(url, "發生問題:404!")
                        newsContent = "404_None"
                    else:
                        # inspect unknown cases
                        print(soup)
                        newsContent = "404_None"
                        raise

                videoLinkInContent = None  # the article body itself has no video
                break
            except requests.exceptions.ConnectionError as e:
                print(url, "發生問題。", e)
                print()
                timeSleepRandomly()
                timeSleepTwo()
                newsContent = None
                videoLinkInContent = None

        return videoLinkInContent, newsContent
Example #24
        def browserClickPageNumber(cls, browser, currentPage, totalPage,
                                   searchword):
            """
                        # Click the page number.
                        # default layout
                        # 1 2 3 4 5 6 7 8 9 10 >> >|
                        # after moving past the first block of 10 pages
                        # 1                                      14
                        # |< << 11 12 13 14 15 16 17 18 19 20 >> >|

                        # accuratePage = browser.find_by_xpath('//*[@id="bt_2_layout_Content"]/div[2]/ul/li[8]/a')
                        accuratePage = browser.find_by_xpath('//*[@id="bt_2_layout_Content"]/div[2]/ul/li[1]/a')
                        accuratePage.text
                        """

            currentPageNum = int(currentPage)
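            # Pages 1-10 map directly to li[1..10]; for later pages, first click the "next block" controls (li[11], then li[13] as needed), then click the in-block offset plus 2, since the first two li entries are the back controls.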
            if currentPageNum <= 10:
                browser.find_by_xpath(
                    f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{currentPageNum}]/a'
                ).click()
                accuratePage = browser.find_by_xpath(
                    f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{currentPageNum}]/a'
                ).text

            elif 11 <= currentPageNum <= 20:
                # go to the 11-20 page block
                browser.find_by_xpath(
                    f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[11]/a'
                ).click()
                clickNum = currentPageNum - 10 + 2
                browser.find_by_xpath(
                    f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{clickNum}]/a'
                ).click()
                accuratePage = browser.find_by_xpath(
                    f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{clickNum}]/a'
                ).text
            else:
                # go to the 11-20 page block
                browser.find_by_xpath(
                    f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[11]/a'
                ).click()

                clickNextTimes = currentPageNum // 10
                # click through until the view shows the correct page block
                for i in range(clickNextTimes - 1):
                    browser.find_by_xpath(
                        f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[13]/a'
                    ).click()
                    timeSleepRandomly()
                    timeSleepOne()

                # click the exact page number
                judgeNum = currentPageNum - (clickNextTimes * 10)
                if judgeNum:
                    clickNum = judgeNum + 2
                elif judgeNum == 0:
                    clickNum = judgeNum + 12
                browser.find_by_xpath(
                    f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{clickNum}]/a'
                ).click()
                accuratePage = browser.find_by_xpath(
                    f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{clickNum}]/a'
                ).text

            print(
                f"{searchword}__目標頁碼:{currentPage}, 點擊頁碼:{accuratePage}, 總頁數:{totalPage}"
            )
Example #25
def detailPageInARow(input, headers, objectiveFolder, objective, *args):
    """
    As many as 28,000 detail urls are supposed to be crawled, so some processes will inevitably fail to get correct responses.
    As such, we should allow more time while crawling, or establish exception handlers in the program.
    """
    # begin = timeCalculate()
    thisPID = os.getpid()
    while True:
        # print(thisPID,"===========================================")
        
        consecutiveUrl = input.get()
        searchword, url, txtFileRoute = consecutiveUrl.split("+")
        
        # print('detailPageInARow is in new process %s, %s ' % (detailPageInARow_proc, thisPID))
        # print()
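        # Up to 4 attempts: the first three retry after short sleeps; the final attempt waits eight seconds before requesting again.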

        for i in range(4):
            if i <= 2:
                try:
                    timeSleepTwo()
                    res = requests.get(url, headers=headers)
                    res.encoding = 'utf-8'
                    timeSleepRandomly()
                    soup = BeautifulSoup(res.text, 'html.parser')
                    break
                except requests.exceptions.ConnectionError as e:
                    print(url, "發生問題。", e)
                    print()
                    timeSleepRandomly()
                    timeSleepTwo()
                    timeSleepTwo()
                    soup = ""
            else:
                try:
                    timeSleepEight()
                    res = requests.get(url, headers=headers)
                    res.encoding = 'utf-8'
                    timeSleepRandomly()
                    soup = BeautifulSoup(res.text, 'html.parser')
                    break
                except requests.exceptions.ConnectionError as e:
                    print(txtFileRoute, "發生問題。", e)
                    print()
                    soup = ""

        # If the second situation is triggered, force soup to an empty string.
        if judgeSoup(soup, searchword, url, txtFileRoute) == "check":
            soup = ""
        


        with open(txtFileRoute, 'w', encoding='utf-8') as f:
            f.write(str(soup))
        
        fileName = txtFileRoute.split("/")[-1]
        productIndex = fileName.split("_")[0]
        productNums = fileName.split("_")[1]
        print(f"{thisPID}__成功寫出  {searchword}  detail頁, 第 {productIndex} 項, 共 {productNums} 項。")
            
        timeSleepRandomly()

        # print('這裡是 detailPageInARow 完成: ' + fileName + " 的爬取。")
        end = timeCalculate()
        # print('detailPageInARow 累計耗時:{0} 秒'.format(end-begin))
        input.task_done()
Example #26
def getPageInARow(input, headers, objectiveFolder, objective, *args):
    thisPID = os.getpid()
    while True:
        # print(thisPID,"===========================================")
        consecutiveUrl = input.get()
        searchword, correctUrl, txtFileRoute = consecutiveUrl.split("+")

        fileName = txtFileRoute.split("/")[-1]
        page = fileName.split("_")[0]
        totalPage = fileName.split("_")[1]

        # print('getPageInARow is in new process %s, %s ' % (getPageInARow_proc, os.getpid()))
        # print('------接下來要處理 ' + searchword + '第' ,page, '頁---------共', totalPage, '頁')
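        # Up to 4 attempts: the first three retry after short random sleeps; the final attempt waits eight seconds before requesting again.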

        for i in range(4):
            if i <= 2:
                try:
                    timeSleepRandomly()
                    res = requests.get(correctUrl, headers=headers)
                    res.encoding = 'utf-8'
                    timeSleepRandomly()
                    timeSleepOne()
                    soup = BeautifulSoup(res.text, 'html.parser')
                    break
                except requests.exceptions.ConnectionError as e:
                    print(fileName, "發生問題。", i, e)
                    print()
                    timeSleepRandomly()
                    timeSleepTwo()
                    soup = ""
            else:
                try:
                    timeSleepEight()
                    timeSleepRandomly()
                    res = requests.get(correctUrl, headers=headers)
                    res.encoding = 'utf-8'
                    timeSleepRandomly()
                    soup = BeautifulSoup(res.text, 'html.parser')
                    break
                except requests.exceptions.ConnectionError as e:
                    print(fileName, "發生問題。", i, e)
                    print()
                    soup = ""

        # If the second situation is triggered, force soup to an empty string.
        if judgeSoup(soup, searchword, correctUrl, txtFileRoute) == "check":
            soup = ""

        # originally:
        # timeSleepOne()
        # timeSleepRandomly()
        # res = requests.get(correctUrl, headers=headers)
        # res.encoding = 'utf-8'
        # timeSleepRandomly()
        # soup  = BeautifulSoup(res.text,'html.parser')

        with open(txtFileRoute, 'w', encoding='utf-8') as f:
            f.write(str(soup))
        # print(f"成功寫出  {searchword}  第 {page} 頁, 共 {totalPage} 頁。")
        end = timeCalculate()
        # print('getPageInARow 累計耗時:{0} 秒'.format(end-begin))
        input.task_done()  # notify the main process that this input item has been handled!
Example #27
def getPageInARowAdvanced(input, objectiveFolder, objective):
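    # Worker process: takes "searchword+page+totalPage+url" items from the queue, fetches each results page with a fresh headless browser, and appends any url that still fails to the badRequest file before writing the page out.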
    begin = timeCalculate()  # start time for the cumulative elapsed-time log below
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        consecutiveUrl = input.get()
        searchword, page, totalPage, url = consecutiveUrl.split('+')
        # print(url)
        print(
            f"{thisPID}__{getPageInARowAdvanced_proc} 開始處理 {searchword} 的第 {page} 頁:"
        )

        # Creating the browser inside the while True loop avoids being rejected for visiting pages repeatedly with the same browser.
        for i in range(3):
            try:
                timeSleepFour()

                browser = buildSplinterBrowserHeadless('chrome')

                timeSleepRandomly()

                browser.visit(url)

                browserWaitTime(browser)
                timeSleepTwo()

                tempHtml = browser.html
                timeSleepRandomly()

                soup = BeautifulSoup(tempHtml, 'html.parser')
                print(f"讀取{searchword}第 {page} 頁,成功!")
                break
            except (ConnectionRefusedError, TimeoutException,
                    WebDriverException) as e:
                print(
                    f"{thisPID}__{getPageInARowAdvanced_proc} 讀取 {searchword} 第 {page} 頁有問題。",
                    e)
                print(
                    f"{thisPID}__{getPageInARowAdvanced_proc} 重建browser物件,進行再處理 {i} 次!"
                )
                timeSleepFour()
                timeSleepRandomly()
                soup = ""
            # else:
            #     print(f"讀取{searchword}第 {page} 頁,成功!")

        if not soup:
            badRequestRoute = f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/badRequest"
            with open(f"{badRequestRoute}/badRequest_{searchword}.txt",
                      "a",
                      newline='',
                      encoding='utf-8') as f:  # the newline argument has no effect here...
                errorMessage = url + "\n"
                f.write(errorMessage)  # writelines would apply if errorMessage were a list

        with open(
                f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/{searchword}/{page}_{totalPage}_{searchword}.txt",
                'w',
                encoding='utf-8') as f:
            f.write(str(soup))
        print()
        print(f'{thisPID}  成功寫出  {searchword}  第{page}頁,總共{totalPage} 頁。')

        try:
            browser.quit()
            print(
                f"成功關閉 browser{thisPID}__{getPageInARowAdvanced_proc}++++++++++++++++++++++++++++++"
            )
        except:
            print(f"放棄 {thisPID}__{getPageInARowAdvanced_proc} 這個browser。")
            print(
                f"kill {thisPID}__{getPageInARowAdvanced_proc} >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"
            )
            os.kill(thisPID, signal.SIGKILL)
        input.task_done()  #通知main process此次的input處理完成!
        end = timeCalculate()
        print(f'{thisPID}__getPageInARowAdvanced 累計耗時:{end-begin} 秒')
Example #28
def getPageInARow(input, url, firstPage, topTabList, elementUrl,
                  objectiveFolder, objective, *args):
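    # Worker process: takes searchwords from the queue, searches Google News with a headless browser, pages through every result page, and dumps the collected links as a JSON file.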
    begin = timeCalculate()
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        searchword = input.get()

        mkdirForRawData(objectiveFolder,
                        objective,
                        "google",
                        keyword=searchword)
        browser = buildSplinterBrowserHeadless("chrome")

        browser.visit(url)
        browserWaitTime(browser)

        searchwordKeyInAndEnter(browser, searchword)
        browser.driver.set_window_size(1024, 768)

        forSureNws = findOutNws(browser, topTabList)
        keyNews = [key for key in forSureNws if forSureNws[key] == '新聞'].pop()
        # exclude the News tab from the human-simulation mouse_over
        topTabList.remove(int(keyNews))

        print(f"點擊 topTabList {keyNews} 去到 新聞頁")
        # click the News tab
        browser.find_by_xpath(
            f'//*[@id="hdtb-msb-vis"]/div[{keyNews}]/a').click()
        timeSleepRandomly()

        newsDict = {}
        newsDictInner = {}
        while True:
            print(f"進行 {searchword} 第", firstPage, "頁")
            elementUrlExtract(browser, firstPage, topTabList, elementUrl,
                              newsDictInner, searchword)
            judgment = judgeNextPage(browser, searchword)
            if judgment:
                print(f"『{searchword}』 仍有下一頁,繼續爬取!")
                firstPage += 1
                pass
            else:
                browser.quit()
                break

        timeStamp = timeStampGenerator()
        newsTotalNum = len(newsDictInner)
        newsDict["dateTime"] = timeStamp
        newsDict["keyword"] = searchword
        newsDict["newsTotalNum"] = newsTotalNum
        newsDict["newsUrl"] = newsDictInner

        with open(
                f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/google/{searchword}/google_{timeStamp}_{newsTotalNum}_{searchword}.json",
                'w',
                encoding='utf-8') as f:
            json.dump(newsDict, f, indent=2, ensure_ascii=False)
        print(
            f'{thisPID}  成功寫出  google_{timeStamp}_{newsTotalNum}_{searchword}.json '
        )

        input.task_done()
        end = timeCalculate()
        print(f'{thisPID}_getPageInARow 累計耗時:{end-begin} 秒')
Example #29
def requestsHandlingWhenTimeoutOccur(url, browserName):
    timeSleepEight()
    browser = buildSplinterBrowserHeadless(browserName)
    timeSleepRandomly()
    browser.visit(url)
Example #30
def elementUrlExtract(browser, firstPage, topTabList, elementUrl,
                      newsDictInner, searchword):
    try:
        for order in elementUrl:
            # broUrl = browser.find_by_xpath(f'//*[@id="rso"]/div/div[{order}]/div/div/h3/a')
            # broPublisher = browser.find_by_xpath(f'//*[@id="rso"]/div/div[{order}]/div/div/div[1]/span[1]')
            # broDate = browser.find_by_xpath(f'//*[@id="rso"]/div/div[{order}]/div/div/div[1]/span[3]')
            # changed 2020/03/19:              //*[@id="rso"]/div[1]/div/div/h3/a
            # changed 2020/10/02:  //*[@id="rso"]/div[1]/g-card/div/div/div[2]/a

            # 2020/10/02: for news items without a thumbnail, the XPaths for the title, publisher, and date all change.
            newsUrl = browser.find_by_xpath(
                f'//*[@id="rso"]/div[{order}]/g-card/div/div/div[2]/a')["href"]

            try:
                newsTitle = browser.find_by_xpath(
                    f'//*[@id="rso"]/div[{order}]/g-card/div/div/div[2]/a/div/div[2]/div[2]'
                ).text
                publisher = browser.find_by_xpath(
                    f'//*[@id="rso"]/div[{order}]/g-card/div/div/div[2]/a/div/div[2]/div[1]'
                ).text
                date = browser.find_by_xpath(
                    f'//*[@id="rso"]/div[{order}]/g-card/div/div/div[2]/a/div/div[2]/div[3]/div[2]/span/span/span'
                ).text
            except AttributeError as e:  # no next page!

                # drop the trailing div[2] from a/div/div[2].
                print(
                    f"擷取 『{searchword}』  第 {firstPage} 頁 , 第 {order} 項是沒有縮圖的新聞物件================",
                    e)
                newsTitle = browser.find_by_xpath(
                    f'//*[@id="rso"]/div[{order}]/g-card/div/div/div[2]/a/div/div[2]'
                ).text
                publisher = browser.find_by_xpath(
                    f'//*[@id="rso"]/div[{order}]/g-card/div/div/div[2]/a/div/div[1]'
                ).text
                date = browser.find_by_xpath(
                    f'//*[@id="rso"]/div[{order}]/g-card/div/div/div[2]/a/div/div[3]/div[2]/span/span/span'
                ).text

            print(
                f"擷取 『{searchword}』  第 {firstPage} 頁 , 第 {order} 項================"
            )
            print(newsUrl)
            print(newsTitle)
            print(publisher)
            print(date)
            date = timeStampCalculate(date)
            print(date)

            timeSleepRandomly()

            publisher = googleNewsRegex.publisherTooLong(
                googleNewsRegex.discardSpace(publisher))

            newsDictInner[newsUrl] = [newsTitle, publisher, date]

            humanSimulate(browser, topTabList)

    except ElementDoesNotExist as e:  # fewer than 10 news items on this page.
        print(f"{searchword} 新聞標的不到10項,準備關閉瀏覽器。", e)
        print(f"成功擷取 『{searchword}』  當前頁的新聞連結。")
        pass
    else:
        print(f"成功擷取 『{searchword}』  當前頁的新聞連結。")