class DcardWrapper:
    """Download posts from a Dcard forum and count word occurrences in them.

    Raw posts are cached on disk through the file-level ``dataIO`` helpers
    (``data/<forum>_<num>_raw_result.dat``); search results are collected in
    ``result_list`` and can be printed or written to a text file.
    ``Dcard``, ``dataIO``, ``os``, ``re`` and ``sys`` are expected to be
    available at file scope.
    """

    def __init__(self):
        self.dcard = Dcard()
        # Maps a category name to the list of words searched under it.
        self.word_list_dict = dict()
        self.forum_list = None
        self.searching_num = 10
        self.totalPostNum = 0
        self.totalCommentNum = 0
        self.commentFromMale = 0
        self.commentFromFemale = 0
        self.raw_result = list()
        self.result_list = []  # one result wrapper per word category
        self.currentForum = ""
        self.currentTitleTopic = ""

    def printInfo(self):
        """Print the forum name and the number of articles to search."""
        print("")
        print("Forum: %s" % self.currentForum)
        print("Number of articles to search: %d" % self.searching_num)

    def printResult(self, toFile=None):
        """Print the summary header and every category result to *toFile*.

        Args:
            toFile: writable text stream. ``None`` (the default) means the
                ``sys.stdout`` that is current at call time, so a later
                redirection of standard output is honoured.
        """
        if toFile is None:
            toFile = sys.stdout
        print("Forum: %s" % self.currentForum, file=toFile)
        if self.currentTitleTopic:
            print("Finding %s in title only." % self.currentTitleTopic,
                  file=toFile)
        print("Total %d posts" % self.totalPostNum, file=toFile)
        print("Total %d comments (male %d / female %d)" %
              (self.totalCommentNum, self.commentFromMale,
               self.commentFromFemale),
              file=toFile)
        print("%-8s %8s %8s %8s %8s %8s %8s" %
              ("Word in", "Title", "Content", "Comment", "Total", "Male",
               "Female"),
              file=toFile)
        for eachType in self.result_list:
            eachType.printTypeResult(toFile)

    def getDataFromForum(self, forumName, searching_num):
        """Download posts from *forumName* and cache them on disk.

        Args:
            forumName: name of the forum to download from.
            searching_num: number of posts to fetch; ``-1`` keeps fetching
                until the source runs out.

        Returns:
            ``False`` when a cache file for this forum/number already exists
            (its content is loaded into ``raw_result``); ``None`` after a
            download.
        """
        self.searching_num = searching_num
        self.currentForum = forumName

        if self.read_raw_result(forumName):
            print("Error in getDataFromForum(): already download.")
            return False

        # Start from an empty buffer so posts left over from an earlier
        # forum are not mixed into this download.
        self.raw_result = []

        f = self.dcard.forums(forumName)
        print(self.searching_num,
              "Meta collecting ...",
              end=' ',
              flush=True)
        m = f.get_metas(num=self.searching_num)
        print("Done.")
        print("")
        print("Posts collecting ...", end=' ', flush=True)
        p = self.dcard.posts(m).get(links=False)
        print("Done.")
        print("")

        # All posts are kept in memory and written once at the end.  A
        # periodic flush-and-clear would lose data here, because
        # write_raw_result() never overwrites an existing cache file.
        count = 0
        while count < self.searching_num or self.searching_num == -1:
            try:
                eachPost = next(p.results)
            except Exception:
                # Exhausted (or failing) source: keep what was fetched and
                # shrink the target to the number of posts actually held.
                self.searching_num = len(self.raw_result)
                break
            self.raw_result.append(eachPost)
            print("Downloading data in %s ...  %d / %d " %
                  (forumName, count, self.searching_num),
                  end='\r',
                  flush=True)
            count += 1

        print("Downloading data in %s ...  %d / %d     Done." %
              (forumName, self.searching_num, self.searching_num),
              end='\r',
              flush=True)
        self.write_raw_result(forumName)

    def _searchWordInPost(self, eachWord, eachPost, wordResultWrapper,
                          titleTopic, first_word):
        """Record the matches of *eachWord* in one post.

        The title, the content and every non-hidden comment are searched
        with ``re.findall`` (so *eachWord* is treated as a regex pattern).
        When *first_word* is true the post and comment totals on ``self``
        are updated as well, so each post is only counted once per run.

        Returns:
            ``True`` when the word was found somewhere in the post and the
            post was processed to the end; ``False`` when it was filtered
            out, not matched, or abandoned because of a malformed field.
        """
        # Only keep posts whose title mentions the requested topic.
        if titleTopic != "":
            try:
                if titleTopic not in eachPost['title']:
                    return False
            except Exception:
                print("")
                print("Error in title, post: ", eachPost)
                return False

        wordInPost = False

        # Search the title.
        try:
            wordInTitles = re.findall(eachWord, eachPost['title'])
        except Exception:
            print("")
            print("Error key title, post: ", eachPost)
            return False
        if first_word:
            self.totalPostNum += 1
        if wordInTitles:
            wordInPost = True
            try:
                wordResultWrapper.addResult(
                    'title', len(wordInTitles), eachPost['gender'],
                    eachPost['createdAt'])
            except Exception:
                print("")
                print("Error in title: ", eachPost['title'])
                return False

        # Search the post content.
        try:
            wordInContent = re.findall(eachWord, eachPost['content'])
        except Exception:
            print("")
            print("Error key content, post: ", eachPost)
            return False
        if wordInContent:
            wordInPost = True
            try:
                wordResultWrapper.addResult(
                    'content', len(wordInContent), eachPost['gender'],
                    eachPost['createdAt'])
            except Exception:
                print("")
                print("Error in wordInContent, post: ", eachPost['content'])
                return False

        # Search the comments; a post without a 'comments' entry simply
        # has none to search.
        for eachComment in (eachPost.get('comments') or ()):
            try:
                if eachComment['hidden']:
                    continue
                if first_word:
                    # Comments without a gender (or from official accounts)
                    # are left out of the totals.
                    gender = eachComment['gender']
                    if gender == 'M':
                        self.totalCommentNum += 1
                        self.commentFromMale += 1
                    elif gender == 'F':
                        self.totalCommentNum += 1
                        self.commentFromFemale += 1

                wordInComment = re.findall(eachWord, eachComment['content'])
                if wordInComment:
                    wordInPost = True
                    wordResultWrapper.addResult(
                        'comment', len(wordInComment),
                        eachComment['gender'], eachComment['createdAt'])
            except Exception:
                print("")
                print("Error in searching eachComment: ", eachComment)
        return wordInPost

    def searchWordFromData(self, forumName, searching_num, titleTopic=""):
        """Search every configured word in the posts of *forumName*.

        Cached data is used when available, otherwise it is downloaded
        first.  The results are appended to ``result_list``, printed to
        standard output and written to
        ``Dcard/result/<forum>_<num>_result<titleTopic>.txt``.

        Args:
            forumName: forum whose posts are searched.
            searching_num: number of posts to search.
            titleTopic: when non-empty, only posts whose title contains
                this text are searched.
        """
        self.searching_num = searching_num
        self.currentForum = forumName
        self.currentTitleTopic = titleTopic
        self.printInfo()
        print("")
        # Download if no data found.
        if not self.read_raw_result(forumName):
            self.getDataFromForum(forumName, searching_num)

        # Reconcile the requested number with the data actually available
        # once, before any indexing, so a short or empty data set can never
        # be indexed out of range.
        if len(self.raw_result) != self.searching_num:
            print("ERROR in seaching: searching num %d > data num %d" %
                  (self.searching_num, len(self.raw_result)))
            print("Turn searching num into data num")
            print("")
            self.searching_num = len(self.raw_result)

        first_word = True  # post/comment totals are counted on the first word only
        for typeName, wordList in self.word_list_dict.items():
            typeResultWrapper = dataIO.TypeResultWrapper(typeName)
            print("")
            print("#Type: ", typeName)
            for eachWord in wordList:
                wordResultWrapper = dataIO.WordResultWrapper(eachWord)
                for count, eachPost in enumerate(self.raw_result):
                    print("Searching %s in %s ...  %d / %d " %
                          (eachWord, forumName, count, self.searching_num),
                          end='\r',
                          flush=True)
                    if self._searchWordInPost(eachWord, eachPost,
                                              wordResultWrapper, titleTopic,
                                              first_word):
                        wordResultWrapper.postNum += 1

                print(
                    "Searching %s in %s ...  %d / %d                  Done." %
                    (eachWord, forumName, self.searching_num,
                     self.searching_num))
                typeResultWrapper.addWord(wordResultWrapper)
                first_word = False
            self.result_list.append(typeResultWrapper)
        print("End of searching")
        print("")
        self.printResult()

        directory = 'Dcard/result/'
        os.makedirs(directory, exist_ok=True)
        filename = directory + '%s_%d_result%s.txt' % (
            self.currentForum, self.searching_num, titleTopic)
        if os.path.isfile(filename):
            print("Warning: file %s already exists" % filename)
            input("Continue?")

        # Explicit UTF-8 so non-ASCII forum names and words can be written
        # whatever the platform's default encoding is.
        with open(filename, 'w', encoding='utf-8') as outFile:
            self.printResult(outFile)
        print("File %s written." % filename)

    def write_raw_result(self, forumName):
        """Store ``raw_result`` in the cache file for *forumName*.

        An existing cache file is never overwritten; a notice is printed
        instead.
        """
        os.makedirs('data', exist_ok=True)
        filename = 'data/%s_%d_raw_result.dat' % (forumName,
                                                  self.searching_num)
        if os.path.isfile(filename):
            print("File %s already exists" % filename)
        else:
            dataIO.writePickle(filename, self.raw_result)

    def read_raw_result(self, forumName):
        """Load the cache file for *forumName* into ``raw_result``.

        Returns:
            ``True`` when the cache file existed and was loaded, ``False``
            otherwise (``raw_result`` is left untouched in that case).
        """
        filename = 'data/%s_%d_raw_result.dat' % (forumName,
                                                  self.searching_num)
        if not os.path.isfile(filename):
            return False
        # NOTE(review): readPickle presumably unpickles the file; only load
        # cache files this program wrote itself - confirm against dataIO.
        self.raw_result = dataIO.readPickle(filename)
        return True
Пример #2
0
@author: daniel
"""

from dcard import Dcard
from dateutil.parser import parse
from dateutil.tz import tzutc
from datetime import datetime
import json

def after_date(after):
    """Build a metas filter that keeps posts created on or after *after*.

    Args:
        after: date string understood by ``dateutil.parser.parse``.

    Returns:
        A callable taking a list of post metas and returning the ones whose
        'createdAt' is not earlier than *after*.
    """
    def keep_recent(metas):
        # Parse the threshold once per batch instead of once per post.
        threshold = parse(after)
        # The last character of 'createdAt' is dropped before parsing
        # (presumably a trailing 'Z' UTC marker, so that both sides are
        # naive datetimes - TODO confirm against the API payload).
        return [m for m in metas if parse(m['createdAt'][:-1]) >= threshold]

    return keep_recent

# Forums to crawl; each one is exported to '<board_name>.json'.
board_lists = ['ntu', 'ncku', 'nthu', 'nctu']

dcard = Dcard()

for board_name in board_lists:
    print('Crawling '+board_name+' ...')

    # 'ncku' is given a larger fetch window than the other boards.
    n = 1300 if board_name == 'ncku' else 300

    # Newest posts first, keeping only those created on/after 2018-8-1.
    metas = dcard.forums(board_name).get_metas(num=n, sort='new', callback=after_date('2018-8-1'))
    print('Totally {} posts ...'.format(len(metas)))
    if not metas:
        # The date filter can leave nothing; metas[-1] would raise
        # IndexError, so skip this board instead of aborting the whole run.
        print('No posts found, skipping.')
        print()
        continue
    print('Oldest post date: {}'.format(metas[-1]['createdAt']))
    print()

    posts = dcard.posts(metas).get()

    with open(board_name+'.json', 'w', encoding='utf-8') as f:
        json.dump(posts.result(), f, ensure_ascii=False)
Пример #3
0
from dcard import Dcard
import json

if __name__ == '__main__':
    dcard = Dcard()

    def tag():
        """Count the topic tags used by recent posts of the 'ncku' forum.

        Fetches the newest posts, walks them until the first post whose id
        is below 228480142 (the first post that carried tags, per the
        original note) and prints the number of distinct tags and the total
        number of tags seen.
        """
        number = 4500
        # NOTE(review): nothing in the visible code writes to `s`; the file
        # is only opened (and truncated) here - confirm whether output to
        # 'topic.txt' is still intended.
        with open('topic.txt', 'w+', encoding='utf-8-sig') as s:
            article_metas = dcard.forums('ncku').get_metas(num=number,
                                                           sort='new')
            count = 0
            A = set()  # distinct tags seen
            topic = {}  # tag -> number of occurrences

            # Iterating (instead of indexing with a running counter) stops
            # cleanly at the end of the list when every fetched post is
            # newer than the threshold id.
            for meta in article_metas:
                if meta["id"] < 228480142:
                    break
                # Count every tag of the post, not just the last one.
                for tags in (meta["topics"] or ()):
                    count += 1
                    A.add(tags)
                    topic[tags] = topic.get(tags, 0) + 1

            print("不重複tag數", len(A))  # number of distinct tags
            print("總tag數", count)  # total number of tags
Пример #4
0
and export the result as a .json format file.
'''
import time
import json
from dcard import Dcard
import numpy as np
import pandas as pd
import datetime
import re

# Remember when the run began (seconds since the epoch).
start = time.time()
print('The Dcard Crawler program starts...')
# Today's date as YYYY_MM_DD, used to stamp the output file name.
today = datetime.date.today().isoformat().replace('-', '_')

dcard = Dcard()
# Change the forum name here to crawl a different board.
forum = dcard.forums('food')
# Change num to set how many posts to fetch.  Per the original note the
# download lists the newest posts first, so the list is reversed to put the
# newest posts last.
dcard_json = forum.get_metas(num=1000)[::-1]
# Export in .json format; edit the template to customise the file name.
file_name = 'raw_dcard_food{}.json'.format(today)
with open(file_name, 'w') as f:
    f.write(json.dumps(dcard_json))

dcard_food = pd.read_json(file_name, encoding='utf-8')

# Parse the createAt column as a date-time format

date_list, year_list = [], []
Пример #5
0
from dcard import Dcard
import json, sys
# Usage: <script> <forum> <number-of-posts>
if len(sys.argv) < 3:
    sys.exit('usage: %s <forum> <number-of-posts>' % sys.argv[0])
topic = sys.argv[1]
num = int(sys.argv[2])

dcard = Dcard()
ariticle_metas = dcard.forums(topic).get_metas(num=num, sort='new')
articles = dcard.posts(ariticle_metas).get()
# Materialise the posts once and reuse them for both outputs.
# NOTE(review): `articles.results` looks like a one-shot iterator in this
# library; if result() drains it, a second pass over it would yield nothing
# - confirm against the dcard-spider documentation.
posts = list(articles.result())
with open('result.json', 'w', encoding='utf-8') as f:
    json.dump(posts, f, ensure_ascii=False)
with open('output.kcm', 'w', encoding='utf-8') as f:
    for post in posts:
        # One line each for title, content and every comment, so the first
        # comment is no longer glued onto the end of the content.
        lines = [post['title'], post['content']]
        lines.extend(c.get('content', '') for c in post['comments'])
        f.write('\n'.join(lines) + '\n')
Пример #6
0
from dcard import Dcard
# Tag that a post must carry to be kept by hot().
target = '女友'


def hot(metas):
    """Return the metas whose 'tags' contain the module-level ``target``.

    Metas without a 'tags' entry (or with an empty / ``None`` one) are
    treated as untagged and dropped instead of raising ``KeyError``.
    """
    return [m for m in metas if target in (m.get('tags') or ())]


# Fetch the posts kept by hot() from the photography forum and download
# the resources parsed out of them.
d = Dcard()
f = d.forums('photography')  # photography forum
m = f.get_metas(num=50, callback=hot)  # list
p = d.posts(m).get(comments=False, links=False)
r = p.parse_resources()  # list: try r[0][1]
done, fails = p.download(r)
# Report the download count only when nothing failed.
if len(fails) == 0:
    print('Got %d pics' % done)
else:
    print('Error!')
Пример #7
0
def dcard():
    """Create and return a new ``Dcard`` instance."""
    client = Dcard()
    return client