Example no. 1
def parseReplys(reply_area, comment, user_id):
    abbrs = reply_area.findAll("abbr")
    for abbr in abbrs:
        # The reply container sits three levels above its timestamp <abbr>.
        tag_withID = abbr.parent.parent.parent
        reply = Entity()
        reply.parent = comment
        reply.postId = toMD5('{}_{}'.format(comment.postId,
                                            tag_withID.get("id")))
        reply.rid = comment.parent.postId
        h3_authorName = tag_withID.find("h3")
        reply.authorName = h3_authorName.getText()
        reply.content = h3_authorName.next_sibling.getText()
        if reply.content == "":
            reply.content = "圖片"  # image-only reply; "圖片" means "image"
        reply.articleDate = speculateArticlePostDate(
            tag_withID.find("abbr").getText())
        react_like_tag = tag_withID.find("abbr").parent.find(
            "span", {'id': re.compile('^like_.*')})
        ufi_url = react_like_tag.find(
            "a", {"href": re.compile('^/ufi/reaction/profile/browser/.*')})
        if ufi_url:
            setReplyAttr(reply, generateMFBUrl(ufi_url.get("href")))
        print('Writing reply: ', reply.toMap())

    # Follow the "more replies" pagination link, if any, and recurse.
    more = reply_area.find("div",
                           {"id": re.compile("^comment_replies_more_1.*")})
    if more:
        more_url = generateMFBUrl(more.a.get("href"))
        resp = sendRequest(more_url)
        soup = BeautifulSoup(resp.text, features='lxml')
        main_comment = soup.find("div", id=user_id)
        reply_area = main_comment.next_sibling
        parseReplys(reply_area, comment, user_id)
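The snippet depends on project helpers that the listing does not show. A minimal sketch of what toMD5 and generateMFBUrl could look like, assuming MD5 hex digests for post ids and m.facebook.com as the base host (both are assumptions, not the project's confirmed code):

import hashlib
from urllib.parse import urljoin

def toMD5(text):
    # Assumed helper: hash text into a stable hex digest used as a post id.
    return hashlib.md5(text.encode('utf-8')).hexdigest()

def generateMFBUrl(href):
    # Assumed helper: resolve a relative m.facebook.com href to an absolute URL.
    return urljoin('https://m.facebook.com/', href)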
Example no. 2
def parseArticle(url):
    resp = sendRequest(url)
    soup = BeautifulSoup(resp.text, features='lxml')
    article = Entity()
    # Canonical URL: match through "refid=17", then trim the final character.
    article.url = re.search("https://.*refid=17", url).group()[:-1]
    article.postId = toMD5(url)
    article.rid = article.postId
    # The author's name lives in the third "presentation" table on the page.
    authorName = soup.findAll("table", {"role": "presentation"})[2].getText()
    article.articleDate = speculateArticlePostDate(soup.find("abbr").getText())
    if authorName:
        # Strip Facebook's "查看編輯紀錄" ("view edit history") label from the name.
        article.authorName = re.sub("查看編輯紀錄", "", authorName)
    else:
        article.authorName = "???"
    article.content = soup.find("title").getText()
    # Title: leading text up to the first CJK or ASCII sentence-ending punctuation.
    title = re.search("^.*[??!!。~~.]+", article.content)
    if title:
        article.title = title.group()
    else:
        article.title = article.content[0:20]

    replyBar = soup.find("div", {
        "id": "add_comment_switcher_placeholder"
    }).next_sibling.a
    replyBarUrl = generateMFBUrl(replyBar.get("href"))
    setReplyAttr(article, replyBarUrl)
    print(article.toMap())
    article.setAttr("soup", soup)
    return article
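sendRequest and randomSleep are likewise defined elsewhere in the project. A plausible pair, assuming a shared requests.Session like the session object Examples no. 4 and no. 5 use directly (the cookie-persistence setup is not shown and is guessed here):

import random
import time
import requests

session = requests.Session()

def randomSleep(low=1.0, high=3.0):
    # Assumed helper: pause a random interval to stay under rate limits.
    time.sleep(random.uniform(low, high))

def sendRequest(url):
    # Assumed helper: fetch a page via the shared session, forcing UTF-8.
    resp = session.get(url)
    resp.encoding = 'utf-8'
    return resp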
Example no. 3
def processCommentAndReply(user_id, replyUrl, comment):
    randomSleep()
    resp = sendRequest(replyUrl)
    soup = BeautifulSoup(resp.text, features='lxml')
    # The comment block is keyed by the commenter's DOM id; its <h3> holds the name.
    main_comment = soup.find("div", id=user_id)
    h3_authorName = main_comment.find("h3")
    comment.postId = toMD5('{}_{}'.format(comment.parent.url, user_id))
    comment.rid = comment.parent.postId
    comment.authorName = h3_authorName.getText()
    comment.content = h3_authorName.next_sibling.getText()
    if comment.content == "":
        comment.content = "圖片"  # image-only comment; "圖片" means "image"

    print("留言時間: ", main_comment.find("abbr").getText())
    comment.articleDate = speculateArticlePostDate(
        main_comment.find("abbr").getText())
    react_like_tag = main_comment.find("abbr").parent.find(
        "span", {'id': re.compile('^like_.*')})
    ufi_url = react_like_tag.find(
        "a", {"href": re.compile('^/ufi/reaction/profile/browser/.*')})
    if ufi_url:
        setReplyAttr(comment, generateMFBUrl(ufi_url.get("href")))
    print('Writing comment: ', comment.toMap())

    reply_area = main_comment.next_sibling
    parseReplys(reply_area, comment, user_id)
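The Entity container is also not shown. A minimal sketch that supports the attribute access, setAttr/getAttr, and toMap() calls seen in these examples (the real class may carry more persistence logic):

class Entity:
    # Assumed stand-in: loose attribute bag for articles, comments and replies.
    def setAttr(self, key, value):
        setattr(self, key, value)

    def getAttr(self, key):
        return getattr(self, key, None)

    def toMap(self):
        # Serialize public fields, dropping nested or heavyweight ones.
        return {k: v for k, v in vars(self).items()
                if k not in ('soup', 'parent')}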
Example no. 4
def getCrawablePage(self, url):
    # Returns the "顯示更多" ("show more") pagination link as a bs4 Tag,
    # or None once posts older than the txDate cutoff are reached.
    print("frontPage url: ", url)
    randomSleep()
    resp = session.get(url)
    session.cookies.save()
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, features='lxml')
    pageBar = soup.find("a", text="顯示更多")
    articles = soup.findAll("div", {"role": "article"})
    for a in articles:
        try:
            abbr = a.find("abbr").getText()
            postDate = speculateArticlePostDate(abbr)
            print("postDate = ", postDate)
            if postDate >= txDate:
                # Resolve the "完整動態" ("full story") permalink for fresh posts.
                target = {
                    'url':
                    generateMFBUrl(a.find("a", text="完整動態").get("href"))
                }
                print(target)
            else:
                # Reached posts older than the cutoff: stop paginating.
                pageBar = None
                break
        except Exception:
            # Skip article blocks without a parseable timestamp.
            continue
    return pageBar
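speculateArticlePostDate turns Facebook's relative timestamp strings into datetimes comparable with the txDate cutoff. Its real implementation is not shown; a rough sketch, assuming Chinese-locale strings such as '3小時' ('3 hours') and '昨天' ('yesterday'); every pattern below is a guess:

import re
from datetime import datetime, timedelta

def speculateArticlePostDate(text):
    # Assumed helper: best-effort parse of relative timestamps; None if unknown.
    now = datetime.now()
    m = re.search(r'(\d+)\s*分鐘', text)    # "N minutes ago"
    if m:
        return now - timedelta(minutes=int(m.group(1)))
    m = re.search(r'(\d+)\s*小時', text)    # "N hours ago"
    if m:
        return now - timedelta(hours=int(m.group(1)))
    if '昨天' in text:                      # "yesterday"
        return now - timedelta(days=1)
    m = re.search(r'(\d+)月(\d+)日', text)  # "M月D日", current year assumed
    if m:
        return now.replace(month=int(m.group(1)), day=int(m.group(2)))
    return None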
Example no. 5
def getCrawablePage(self, url):
    # Returns the next-page link as a bs4 Tag, or None once posts older
    # than the txDate cutoff are reached.
    print("frontPage url: ", url)
    randomSleep()
    resp = session.get(url)
    session.cookies.save()
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, features='lxml')
    pageBar = getNextPageBar(soup)
    articles = soup.findAll("article", id=re.compile("^u_0_[0-9]$"))
    for a in articles:
        try:
            abbr = a.find("abbr").getText()
            postDate = speculateArticlePostDate(abbr)
            if postDate is None:
                continue
            if postDate >= txDate:
                # Queue the "完整動態" ("full story") permalink for crawling.
                target = {
                    'url':
                    generateMFBUrl(a.find("a", text="完整動態").get("href"))
                }
                urlList.append(target)
                print("Fetched article postDate: ", postDate, " url: ", target)
            else:
                # Reached posts older than the cutoff: stop paginating.
                pageBar = None
                break
        except Exception:
            # Skip article blocks without a parseable timestamp.
            continue
    return pageBar
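Example no. 5 delegates pagination lookup to getNextPageBar, which is not shown. Assuming it just finds the same '顯示更多' ('show more') anchor that Example no. 4 searches for inline, a one-line sketch:

def getNextPageBar(soup):
    # Assumed helper: locate the pagination anchor; returns a Tag or None.
    return soup.find("a", text="顯示更多")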
Example no. 6
def parseComments(article):
    soup = article.getAttr("soup")
    comment_tag = soup.find("div", {
        "id": "add_comment_link_placeholder"
    }).previous_sibling
    abbrs = comment_tag.findAll("abbr")
    for abbr in abbrs:
        # The comment container sits three levels above its timestamp <abbr>.
        user_comment = abbr.parent.parent.parent
        user_id = user_comment.get("id")
        replyUrl = generateMFBUrl(
            user_comment.find("a", text="回覆").get("href"))  # "回覆" = "reply"
        comment = Entity()
        comment.parent = article
        processCommentAndReply(user_id, replyUrl, comment)

    # Follow the "see next comments" pagination link, if any, and recurse.
    more = comment_tag.find("div", {"id": re.compile("^see_next_.*")})
    if more:
        more_url = generateMFBUrl(more.a.get("href"))
        randomSleep()
        resp = sendRequest(more_url)
        soup = BeautifulSoup(resp.text, features='lxml')
        article.setAttr("soup", soup)
        parseComments(article)
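Chained together, a crawl of a single post could look like the following hypothetical driver (the story URL is a placeholder, not a real post):

article = parseArticle('https://m.facebook.com/story.php?story_fbid=123&refid=17')
parseComments(article)  # walks every comment, which in turn walks its replies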
Example no. 7
def getNextPage(self, pageBar) -> str:
    url = generateMFBUrl(pageBar.get("href"))
    return url
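A hypothetical outer loop tying getCrawablePage and getNextPage together; the Crawler class name and start URL are placeholders, not the project's actual entry point:

crawler = Crawler()
url = 'https://m.facebook.com/groups/123456'
while url:
    pageBar = crawler.getCrawablePage(url)
    url = crawler.getNextPage(pageBar) if pageBar else None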