예제 #1
0
def get_forums(forum, article_num, comment_top_k):
    """Collect cleaned post bodies and their most-liked comments from a forum.

    Args:
        forum: Forum name passed to ``dcard.forums``.
        article_num: Number of post metas to request.
        comment_top_k: Maximum number of comments kept per post, chosen by
            highest ``likeCount``.

    Returns:
        dict: ``{title: {'content': str, 'tag': str, 'comment': [str, ...]}}``.
        Posts without content, tags or usable comments are skipped. Posts
        that share a title overwrite one another.
    """
    dcard = Dcard()
    metas = dcard.forums(forum).get_metas(num=article_num)
    # Fetch the posts exactly once; nothing else in this function needs them.
    articles = dcard.posts(metas).get(content=True, links=False)

    # Characters stripped from every post body, removed in a single pass.
    strip_table = str.maketrans('', '', '\n /><.')
    required_keys = ('content', 'tags', 'comments')

    article_dic = {}
    for article in articles.result():
        if not all(key in article for key in required_keys):
            continue

        comments = article['comments']
        if (article['content'] == '' or len(comments) == 0
                or 'error' in comments):
            continue

        # Only comments that can be ranked *and* quoted are candidates;
        # a comment lacking 'content' would otherwise raise KeyError below.
        rankable = [
            c for c in comments if 'likeCount' in c and 'content' in c
        ]
        top_comments = sorted(rankable,
                              key=itemgetter('likeCount'),
                              reverse=True)[:comment_top_k]

        article_dic[article['title']] = {
            'content': article['content'].translate(strip_table),
            'tag': ''.join(article['tags']),
            'comment': [c['content'] for c in top_comments],
        }

    return article_dic
예제 #2
0
def main():
    """Fetch the newest posts of the 'mood' forum and export them one by one.

    Each post's title and content are handed to
    ``ouput_board_page_articles_json`` as a single-element list together with
    a running index. Posts that cannot be exported are reported and skipped;
    the index only advances for exported posts.
    """
    dcard = Dcard()
    ariticle_metas = dcard.forums('mood').get_metas(num=1000000, sort='new')
    # Number of metas actually returned.
    print(len(ariticle_metas))
    ids = [meta['id'] for meta in ariticle_metas]
    articles = dcard.posts(ids).get(comments=False, links=False)

    index = 0
    for article in articles.results:
        try:
            # A fresh dict per post, so a failure cannot leak the previous
            # post's fields into this one.
            data = {'title': article['title'], 'content': article['content']}
            ouput_board_page_articles_json(data['title'], [data], index)
        except Exception:
            # Best effort: skip posts that are missing fields or fail to
            # export, but let KeyboardInterrupt/SystemExit propagate.
            print("this article no exist!")
            continue
        print(str(index) + " page finish!")
        index += 1
예제 #3
0
from dcard import Dcard


def 先過濾出標題含有作品關鍵字(metas, keyword=''):
    """Keep only the metas whose title contains ``keyword``.

    Args:
        metas: Iterable of post meta dicts, each with a ``'title'`` string.
        keyword: Substring to look for in the title. The default empty
            string is contained in every title, so all metas are kept.

    Returns:
        list: The matching metas, in their original order.
    """
    return [meta for meta in metas if keyword in meta['title']]


if __name__ == '__main__':
    # Collect metas from the 'sex' forum, with the title filter as callback.
    client = Dcard()
    forum = client.forums('sex')
    filtered_metas = forum.get_metas(num=100, callback=先過濾出標題含有作品關鍵字)

    # Fetch the posts with comments and links switched off, then hand the
    # parsed resources to the downloader.
    fetched = client.posts(filtered_metas).get(comments=False, links=False)
    _, failures = fetched.download(fetched.parse_resources())

    # Report success only when the downloader returned no failures.
    if len(failures) == 0:
        print('成功下載!')
    else:
        print('出了點錯下載不完全喔')
예제 #4
0
@author: daniel
"""

from dcard import Dcard
from dateutil.parser import parse
from dateutil.tz import tzutc
from datetime import datetime
import json

def after_date(after):
    """Build a metas callback that keeps posts created on or after a date.

    Args:
        after: Date string understood by ``dateutil.parser.parse``.

    Returns:
        A function that takes a list of metas and returns those whose
        ``'createdAt'`` value (with its last character dropped) parses to a
        moment not earlier than ``after``.
    """
    # Parse the threshold once instead of once per meta.
    threshold = parse(after)

    def keep_recent(metas):
        # The last character of 'createdAt' is removed before parsing --
        # presumably a trailing 'Z', so the result is naive like the
        # threshold. TODO confirm against the API payload.
        return [m for m in metas if parse(m['createdAt'][:-1]) >= threshold]

    return keep_recent

# Forums to crawl; each one is dumped to '<board_name>.json'.
board_lists = ['ntu', 'ncku', 'nthu', 'nctu']

dcard = Dcard()

for board_name in board_lists:
    print('Crawling '+board_name+' ...')

    # 'ncku' is given a larger request size than the other boards.
    n = 1300 if board_name == 'ncku' else 300

    metas = dcard.forums(board_name).get_metas(num=n, sort='new', callback=after_date('2018-8-1'))
    print('Totally {} posts ...'.format(len(metas)))
    # The callback may filter out every post; indexing an empty list would
    # raise IndexError and abort the remaining boards.
    if metas:
        print('Oldest post date: {}'.format(metas[-1]['createdAt']))
    print()

    posts = dcard.posts(metas).get()

    with open(board_name+'.json', 'w', encoding='utf-8') as f:
        json.dump(posts.result(), f, ensure_ascii=False)
class DcardWrapper:
    """Download Dcard forum posts and count keyword hits in them.

    Downloaded posts are cached under ``data/`` through ``dataIO``; the
    search report is printed and also written under ``Dcard/result/``.
    """

    def __init__(self):
        self.dcard = Dcard()
        # {type name: [words to search for, ...]}
        self.word_list_dict = dict()
        self.forum_list = None
        self.searching_num = 10
        self.totalPostNum = 0
        self.totalCommentNum = 0
        self.commentFromMale = 0
        self.commentFromFemale = 0
        # Posts of the current forum, as downloaded or read from the cache.
        self.raw_result = list()
        # One dataIO.TypeResultWrapper per searched word type.
        self.result_list = []
        self.currentForum = ""
        self.currentTitleTopic = ""

    def printInfo(self):
        """Print the current forum and the number of articles to search."""
        print("")
        print("Forum: %s" % self.currentForum)
        print("Number of articles to search: %d" % self.searching_num)

    def printResult(self, toFile=None):
        """Print the accumulated search report.

        Args:
            toFile: Writable text stream. ``None`` (the default) means the
                ``sys.stdout`` in effect at call time, so redirected output
                is honoured.
        """
        if toFile is None:
            toFile = sys.stdout
        print("Forum: %s" % self.currentForum, file=toFile)
        if self.currentTitleTopic:
            print("Finding %s in title only." % self.currentTitleTopic,
                  file=toFile)
        print("Total %d posts" % self.totalPostNum, file=toFile)
        print("Total %d comments (male %d / female %d)" %
              (self.totalCommentNum, self.commentFromMale,
               self.commentFromFemale),
              file=toFile)
        print("%-8s %8s %8s %8s %8s %8s %8s" %
              ("Word in", "Title", "Content", "Comment", "Total", "Male",
               "Female"),
              file=toFile)
        for eachType in self.result_list:
            eachType.printTypeResult(toFile)

    def _raw_result_path(self, forumName):
        """Return the cache file path for ``forumName`` and the current count."""
        return 'data/%s_%d_raw_result.dat' % (forumName, self.searching_num)

    def getDataFromForum(self, forumName, searching_num):
        """Download posts from a forum and cache them on disk.

        Args:
            forumName: Forum name passed to ``dcard.forums``.
            searching_num: Number of posts to fetch; ``-1`` keeps reading
                until the post stream ends.

        Returns:
            ``False`` when a cache file for this forum and count already
            exists (it is loaded and nothing is downloaded); otherwise
            ``None``.
        """
        self.searching_num = searching_num
        self.currentForum = forumName

        if self.read_raw_result(forumName):
            print("Error in getDataFromForum(): already download.")
            return False

        # Start from an empty buffer so posts left over from a previously
        # processed forum are not mixed into this download.
        self.raw_result = []

        forum = self.dcard.forums(forumName)
        print(self.searching_num,
              "Meta collecting ...",
              end=' ',
              flush=True)
        metas = forum.get_metas(num=self.searching_num)
        print("Done.")
        print("")
        print("Posts collecting ...", end=' ', flush=True)
        posts = self.dcard.posts(metas).get(links=False)
        print("Done.")
        print("")

        # All posts stay in memory and are written once at the end:
        # write_raw_result never overwrites an existing file, so flushing
        # partial batches to the same path would drop every later batch.
        count = 0
        while count < self.searching_num or self.searching_num == -1:
            try:
                eachPost = next(posts.results)
            except Exception:
                # The stream ended (or failed): keep what was collected and
                # make the requested count match it.
                self.searching_num = len(self.raw_result)
                break
            self.raw_result.append(eachPost)
            print("Downloading data in %s ...  %d / %d " %
                  (forumName, count, self.searching_num),
                  end='\r',
                  flush=True)
            count += 1

        print("Downloading data in %s ...  %d / %d     Done." %
              (forumName, self.searching_num, self.searching_num),
              end='\r',
              flush=True)
        self.write_raw_result(forumName)

    def _searchWordInPost(self, eachWord, eachPost, titleTopic,
                          wordResultWrapper, first_word):
        """Record the hits of ``eachWord`` in one post.

        ``eachWord`` is used as a regular-expression pattern. Totals for
        posts and comments are only updated while ``first_word`` is true.

        Returns:
            bool: True when the post should count towards the word's
            ``postNum``; False when it was skipped or had no hit.
        """
        # Filter those with specific topic in title.
        if titleTopic != "":
            try:
                if titleTopic not in eachPost['title']:
                    return False
            except Exception:
                print("")
                print("Error in title, post: ", eachPost)
                return False

        wordInPost = False

        # Searching word in title.
        try:
            wordInTitles = re.findall(eachWord, eachPost['title'])
            self.totalPostNum += 1 if first_word else 0
        except Exception:
            print("")
            print("Error key title, post: ", eachPost)
            return False
        if wordInTitles:
            wordInPost = True
            try:
                wordResultWrapper.addResult(
                    'title', len(wordInTitles), eachPost['gender'],
                    eachPost['createdAt'])
            except Exception:
                print("")
                print("Error in title: ", eachPost['title'])
                return False

        # Searching word in content of posts.
        try:
            wordInContent = re.findall(eachWord, eachPost['content'])
        except Exception:
            print("")
            print("Error key content, post: ", eachPost)
            return False
        if wordInContent:
            wordInPost = True
            try:
                wordResultWrapper.addResult(
                    'content', len(wordInContent),
                    eachPost['gender'], eachPost['createdAt'])
            except Exception:
                print("")
                print("Error in wordInContent, post: ",
                      eachPost['content'])
                return False

        # Searching word in comments of posts. A post without a usable
        # 'comments' entry simply contributes no comments.
        for eachComment in eachPost.get('comments') or []:
            try:
                if eachComment['hidden']:
                    continue
                if first_word:
                    # Comments without an 'M'/'F' gender (for example
                    # official ones) are left out of the totals.
                    gender = eachComment['gender']
                    if gender == 'M':
                        self.totalCommentNum += 1
                        self.commentFromMale += 1
                    elif gender == 'F':
                        self.totalCommentNum += 1
                        self.commentFromFemale += 1

                wordInComment = re.findall(eachWord, eachComment['content'])
                if wordInComment:
                    wordInPost = True
                    wordResultWrapper.addResult(
                        'comment', len(wordInComment),
                        eachComment['gender'],
                        eachComment['createdAt'])
            except Exception:
                print("")
                print("Error in searching eachComment: ", eachComment)
        return wordInPost

    def searchWordFromData(self, forumName, searching_num, titleTopic=""):
        """Search every configured word in a forum's posts and report hits.

        Cached posts are used when available; otherwise they are downloaded
        first. The report is printed and written to
        ``Dcard/result/<forum>_<count>_result<titleTopic>.txt``.

        Args:
            forumName: Forum to search.
            searching_num: Number of posts to search.
            titleTopic: When non-empty, only posts whose title contains this
                substring are searched.
        """
        self.searching_num = searching_num
        self.currentForum = forumName
        self.currentTitleTopic = titleTopic
        self.printInfo()
        print("")
        # Download if no data found.
        if not self.read_raw_result(forumName):
            self.getDataFromForum(forumName, searching_num)

        # Post and comment totals are tallied during the first word only.
        first_word = True
        for typeName, wordList in self.word_list_dict.items():
            typeResultWrapper = dataIO.TypeResultWrapper(typeName)
            print("")
            print("#Type: ", typeName)
            for eachWord in wordList:
                wordResultWrapper = dataIO.WordResultWrapper(eachWord)
                count = 0
                # Search the word in each post: title, content and comments.
                while count < self.searching_num:
                    if len(self.raw_result) != self.searching_num:
                        print(
                            "ERROR in seaching: searching num %d > data num %d"
                            % (self.searching_num, len(self.raw_result)))
                        print("Turn searching num into data num")
                        print("")
                        self.searching_num = len(self.raw_result)
                        if count >= self.searching_num:
                            # Nothing left to index (e.g. an empty cache).
                            break
                    eachPost = self.raw_result[count]
                    print("Searching %s in %s ...  %d / %d " %
                          (eachWord, forumName, count, self.searching_num),
                          end='\r',
                          flush=True)
                    count += 1
                    if self._searchWordInPost(eachWord, eachPost, titleTopic,
                                              wordResultWrapper, first_word):
                        wordResultWrapper.postNum += 1

                print(
                    "Searching %s in %s ...  %d / %d                  Done." %
                    (eachWord, forumName, self.searching_num,
                     self.searching_num))
                typeResultWrapper.addWord(wordResultWrapper)
                first_word = False
            self.result_list.append(typeResultWrapper)
        print("End of searching")
        print("")
        self.printResult()

        directory = 'Dcard/result/'
        os.makedirs(directory, exist_ok=True)
        filename = directory + '%s_%d_result%s.txt' % (
            self.currentForum, self.searching_num, titleTopic)
        if os.path.isfile(filename):
            print("Warning: file %s already exists" % filename)
            input("Continue?")

        # Explicit UTF-8 so the report does not depend on the platform's
        # default encoding.
        with open(filename, 'w', encoding='utf-8') as outFile:
            self.printResult(outFile)
        print("File %s written." % filename)

    def write_raw_result(self, forumName):
        """Write ``raw_result`` to the cache file unless it already exists."""
        os.makedirs('data', exist_ok=True)
        filename = self._raw_result_path(forumName)
        if not os.path.isfile(filename):
            dataIO.writePickle(filename, self.raw_result)
        else:
            print("File %s already exists" % filename)

    def read_raw_result(self, forumName):
        """Load ``raw_result`` from the cache file.

        Returns:
            bool: True when the cache file existed and was loaded.
        """
        filename = self._raw_result_path(forumName)
        if not os.path.isfile(filename):
            return False
        # NOTE(review): dataIO.readPickle presumably unpickles the file;
        # only load cache files this program wrote itself.
        self.raw_result = dataIO.readPickle(filename)
        return True
예제 #6
0
from dcard import Dcard
import json, sys
# Command line: <forum name> <number of posts>.
if len(sys.argv) < 3:
    sys.exit('usage: {} <forum> <num>'.format(sys.argv[0]))
topic = sys.argv[1]
num = int(sys.argv[2])

dcard = Dcard()
ariticle_metas = dcard.forums(topic).get_metas(num=num, sort='new')
articles = dcard.posts(ariticle_metas).get()
# Materialise the posts once and reuse the list for both outputs;
# `articles.results` may be a one-shot iterator that is already consumed
# after `result()` has been called.
all_posts = articles.result()
with open('result.json', 'w', encoding='utf-8') as f:
    json.dump(all_posts, f, ensure_ascii=False)
with open('output.kcm', 'w', encoding='utf-8') as f:
    for post in all_posts:
        # Missing fields are written as empty text instead of aborting.
        comment_text = '\n'.join(
            c.get('content', '') for c in post.get('comments', []))
        f.write(post.get('title', '') + '\n' + post.get('content', '') +
                comment_text + '\n')
예제 #7
0
from dcard import Dcard
# Tag a post must carry to be kept by hot().
target = '女友'


def hot(metas):
    """Return the metas whose ``'tags'`` contain the module-level ``target``."""

    def has_target(meta):
        return target in meta['tags']

    return list(filter(has_target, metas))


client = Dcard()
board = client.forums('photography')  # photography board
picked = board.get_metas(num=50, callback=hot)  # list, filtered by hot()
fetched = client.posts(picked).get(comments=False, links=False)
found = fetched.parse_resources()  # list: try found[0][1]
done, fails = fetched.download(found)
# Report the count only when the downloader returned no failures.
if len(fails) == 0:
    print('Got %d pics' % done)
else:
    print('Error!')