def get_forums(forum, article_num, comment_top_k):
    """Fetch the most popular articles of a Dcard forum with their top comments.

    Args:
        forum: Dcard forum name (e.g. 'mood').
        article_num: number of article metas to request.
        comment_top_k: keep at most this many comments per article,
            ranked by 'likeCount' descending.

    Returns:
        dict mapping article title to
        {'content': cleaned text, 'tag': concatenated tags,
         'comment': [top-k comment bodies]}.
    """
    dcard = Dcard()
    # Fetch the most popular article metas, then the posts with content.
    # NOTE: the original issued the posts request twice and discarded the
    # first result; the duplicate network round-trip is removed.
    metas = dcard.forums(forum).get_metas(num=article_num)
    articles = dcard.posts(metas).get(content=True, links=False)

    # Strip noise characters in one C-level pass instead of six chained
    # str.replace() calls (same characters deleted: \n, space, /, >, <, .).
    _noise = str.maketrans('', '', '\n /><.')

    article_dic = {}  # {title: {'content': '', 'comment': [top1, ...], 'tag': ''}}
    for article in articles.result():
        # Skip malformed posts: need content, tags and a usable comment list.
        if not ('content' in article and 'tags' in article and 'comments' in article):
            continue
        if (article['content'] == '' or len(article['comments']) == 0
                or 'error' in article['comments']):
            continue

        cleaned_content = article['content'].translate(_noise)
        joined_tags = ''.join(article['tags'])

        # Comments without a 'likeCount' key (e.g. deleted ones) cannot be
        # ranked, so they are dropped before sorting.
        rankable = [c for c in article['comments'] if 'likeCount' in c]
        top_comments = sorted(rankable, key=itemgetter('likeCount'),
                              reverse=True)[:comment_top_k]

        article_dic[article['title']] = {
            'content': cleaned_content,
            'tag': joined_tags,
            'comment': [c['content'] for c in top_comments],
        }
    return article_dic
def main():
    """Crawl all new articles of the 'mood' forum and emit each as a JSON page
    via ouput_board_page_articles_json (defined elsewhere in this project).
    """
    dcard = Dcard()
    # num is a deliberately huge upper bound; the API stops at the real
    # number of available posts.
    ariticle_metas = dcard.forums('mood').get_metas(num=1000000, sort='new')
    # article length
    print(len(ariticle_metas))
    ids = [meta['id'] for meta in ariticle_metas]
    articles = dcard.posts(ids).get(comments=False, links=False)
    index = 0
    for article in articles.results:
        # BUG FIX: the original reused one shared `data` dict for every
        # article, so every page handed downstream aliased the same object.
        # A fresh dict is built per article instead.
        try:
            data = {'title': article['title'], 'content': article['content']}
        except (KeyError, TypeError):
            # Deleted/withdrawn posts come back without these keys; the
            # original bare `except:` hid every other error as well.
            print("this article no exist!")
            continue
        ouput_board_page_articles_json(data['title'], [data], index)
        print(str(index) + " page finish!")
        index += 1
from dcard import Dcard


def 先過濾出標題含有作品關鍵字(metas):
    """Callback for get_metas: keep only metas whose title contains the
    keyword (currently the empty string, which matches every title).
    """
    kept = []
    for meta in metas:
        if '' in meta['title']:
            kept.append(meta)
    return kept


if __name__ == '__main__':
    client = Dcard()
    filtered_metas = client.forums('sex').get_metas(
        num=100, callback=先過濾出標題含有作品關鍵字)
    fetched_posts = client.posts(filtered_metas).get(comments=False, links=False)
    media = fetched_posts.parse_resources()
    status, fails = fetched_posts.download(media)
    message = '成功下載!' if len(fails) == 0 else '出了點錯下載不完全喔'
    print(message)
@author: daniel """ from dcard import Dcard from dateutil.parser import parse from dateutil.tz import tzutc from datetime import datetime import json def after_date(after): return lambda metas: [m for m in metas if parse(m['createdAt'][:-1]) >= parse(after)] board_lists = ['ntu', 'ncku', 'nthu', 'nctu'] dcard = Dcard() for board_name in board_lists: print('Crawling '+board_name+' ...') n = 1300 if board_name == 'ncku' else 300 metas = dcard.forums(board_name).get_metas(num=n, sort='new', callback=after_date('2018-8-1')) print('Totally {} posts ...'.format(len(metas))) print('Oldest post date: {}'.format(metas[-1]['createdAt'])) print() posts = dcard.posts(metas).get() with open(board_name+'.json', 'w', encoding='utf-8') as f: json.dump(posts.result(), f, ensure_ascii=False)
class DcardWrapper:
    """Downloads Dcard posts for a forum, caches them to disk as pickles, and
    counts occurrences of configured words in titles, contents and comments,
    broken down by commenter gender.
    """

    def __init__(self):
        self.dcard = Dcard()
        self.word_list_dict = dict()  #a dict with {"品德": ["母豬", "妓女"...], ...}
        self.forum_list = None
        # Number of posts to scan; -1 means "download until the feed runs out".
        self.searching_num = 10
        self.totalPostNum = 0      # posts actually inspected
        self.totalCommentNum = 0   # visible comments inspected (gendered only)
        self.commentFromMale = 0
        self.commentFromFemale = 0
        self.raw_result = list()   # downloaded post dicts, also pickled to disk
        self.result_list = []  #a list of result dict
        self.currentForum = ""
        self.currentTitleTopic = ""  # optional title filter used while searching

    def printInfo(self):
        # Announce the forum and scan size before a search begins.
        print("")
        print("Forum: %s" % self.currentForum)
        print("Number of articles to search: %d" % self.searching_num)

    def printResult(self, toFile=sys.stdout):
        """Write the aggregated counting results to *toFile* (stdout by default)."""
        print("Forum: %s" % self.currentForum, file=toFile)
        if (self.currentTitleTopic):
            print("Finding %s in title only." % self.currentTitleTopic,
                  file=toFile)
        print("Total %d posts" % self.totalPostNum, file=toFile)
        print("Total %d comments (male %d / female %d)" %
              (self.totalCommentNum, self.commentFromMale,
               self.commentFromFemale), file=toFile)
        # Header row for the per-word result table printed by each type.
        print("%-8s %8s %8s %8s %8s %8s %8s" %
              ("Word in", "Title", "Content", "Comment", "Total", "Male",
               "Female"), file=toFile)
        for eachType in self.result_list:
            eachType.printTypeResult(toFile)

    def getDataFromForum(self, forumName, searching_num):
        """ Download posts data from forum. """
        self.searching_num = searching_num
        self.currentForum = forumName
        # Only download when no cached pickle for this forum/size exists yet.
        if not (self.read_raw_result(forumName)):
            f = self.dcard.forums(forumName)
            print(self.searching_num, "Meta collecting ...", end=' ', flush=True)
            m = f.get_metas(
                num=self.searching_num)  #, callback=get_words) #list
            print("Done.")
            print("")
            print("Posts collecting ...", end=' ', flush=True)
            p = self.dcard.posts(m).get(links=False)
            print("Done.")
            print("")
            count = 0
            """ Searching the word in each articles, including title, cotent
            and comments. """
            # Pull posts one by one; -1 disables the count limit so the loop
            # only ends when the iterator is exhausted (the except branch).
            while count < self.searching_num or self.searching_num == -1:
                try:
                    eachPost = next(p.results)
                    self.raw_result.append(eachPost)
                except:
                    # Iterator exhausted (or fetch error): shrink the target
                    # to what was actually downloaded and stop.
                    self.searching_num = len(self.raw_result)
                    break
                else:
                    print("Downloading data in %s ... %d / %d " %
                          (forumName, count, self.searching_num),
                          end='\r', flush=True)
                    # Flush to disk every 20000 posts to bound memory use.
                    if (count % 20000 == 0 and count > 1):
                        self.write_raw_result(forumName)
                        self.raw_result = []
                    count += 1
            print("Downloading data in %s ... %d / %d Done." %
                  (forumName, self.searching_num, self.searching_num),
                  end='\r', flush=True)
        else:
            print("Error in getDataFromForum(): already download.")
            return False
        self.write_raw_result(forumName)

    def searchWordFromData(self, forumName, searching_num, titleTopic=""):
        """Count every configured word in the cached posts of *forumName*.

        Downloads first when no cache exists.  When *titleTopic* is given,
        only posts whose title contains it are inspected.  Results go to
        stdout and to Dcard/result/<forum>_<n>_result<topic>.txt.
        """
        self.searching_num = searching_num
        self.currentForum = forumName
        self.currentTitleTopic = titleTopic
        self.printInfo()
        print("")
        """ Download if no data found. """
        if not (self.read_raw_result(forumName)):
            self.getDataFromForum(forumName, searching_num)
        first_word = True  #To count post num and comment num
        for typeName, wordList in self.word_list_dict.items():
            typeResultWrapper = dataIO.TypeResultWrapper(typeName)
            print("")
            print("#Type: ", typeName)
            for eachWord in wordList:
                wordResultWrapper = dataIO.WordResultWrapper(eachWord)
                count = 0
                """ Searching the word in each articles, including title,
                cotent and comments. """
                while count < self.searching_num:
                    wordInTitles = wordInContent = wordInComment = []
                    wordInPost = 0  # 1 iff the word occurs anywhere in this post
                    # Clamp the scan size if fewer posts were cached than asked.
                    if not len(self.raw_result) == self.searching_num:
                        print(
                            "ERROR in seaching: searching num %d > data num %d"
                            % (self.searching_num, len(self.raw_result)))
                        print("Turn searching num into data num")
                        print("")
                        self.searching_num = len(self.raw_result)
                    eachPost = self.raw_result[count]
                    print("Searching %s in %s ... %d / %d " %
                          (eachWord, forumName, count, self.searching_num),
                          end='\r', flush=True)
                    """ There are errors to handle when getting large amount
                    of data. """
                    count += 1
                    # Filter those with specific topic in title
                    if (titleTopic != ""):
                        try:
                            if not (titleTopic in eachPost['title']):
                                continue
                        except:
                            print("")
                            print("Error in title, post: ", eachPost)
                            continue
                    # Searching word in titile
                    try:
                        wordInTitles = re.findall(eachWord, eachPost['title'])
                        # Post total is accumulated only on the first word so
                        # it is not multiplied by the number of search words.
                        self.totalPostNum += 1 if first_word else 0
                    except:
                        print("")
                        print("Error key title, post: ", eachPost)
                        continue
                    if wordInTitles:
                        wordInPost = 1
                        try:
                            wordResultWrapper.addResult(
                                'title', len(wordInTitles),
                                eachPost['gender'], eachPost['createdAt'])
                        except:
                            print("")
                            print("Error in title: ", eachPost['title'])
                            continue
                    # Searching word in content of posts
                    try:
                        wordInContent = re.findall(eachWord,
                                                   eachPost['content'])
                    except:
                        print("")
                        print("Error key content, post: ", eachPost)
                        continue
                    if wordInContent:
                        wordInPost = 1
                        try:
                            wordResultWrapper.addResult(
                                'content', len(wordInContent),
                                eachPost['gender'], eachPost['createdAt'])
                        except:
                            print("")
                            print("Error in wordInContent, post: ",
                                  eachPost['content'])
                            continue
                    # Searching word in comments of posts
                    for eachComment in eachPost['comments']:
                        try:
                            if not eachComment['hidden']:
                                if first_word:
                                    self.totalCommentNum += 1
                                    if eachComment['gender'] == 'M':
                                        self.commentFromMale += 1
                                    elif eachComment['gender'] == 'F':
                                        self.commentFromFemale += 1
                                    else:
                                        #Some comment without gender or from official
                                        self.totalCommentNum -= 1
                                wordInComment = re.findall(
                                    eachWord, eachComment['content'])
                                if wordInComment:
                                    wordInPost = 1
                                    wordResultWrapper.addResult(
                                        'comment', len(wordInComment),
                                        eachComment['gender'],
                                        eachComment['createdAt'])
                        except:
                            print("")
                            print("Error in searching eachComment: ",
                                  eachComment)
                    if (wordInPost):
                        wordResultWrapper.postNum += 1
                print("Searching %s in %s ... %d / %d Done."
                      % (eachWord, forumName, self.searching_num,
                         self.searching_num))
                typeResultWrapper.addWord(wordResultWrapper)
                first_word = False
            self.result_list.append(typeResultWrapper)
        print("End of searching")
        print("")
        self.printResult()
        directory = 'Dcard/result/'
        if not os.path.exists(directory):
            os.makedirs(directory)
        filename = directory + '%s_%d_result%s.txt' % (
            self.currentForum, self.searching_num, titleTopic)
        if os.path.isfile(filename):
            print("Warning: file %s already exists" % filename)
            input("Continue?")
        print("File %s written." % filename)
        with open(filename, 'w') as outFile:
            self.printResult(outFile)
        #self.writeResult()

    def write_raw_result(self, forumName):
        """Pickle the downloaded posts; never overwrites an existing file."""
        if not os.path.exists('./data'):
            os.makedirs('data')
        filename = 'data/%s_%d_raw_result.dat' % (forumName,
                                                  self.searching_num)
        if not os.path.isfile(filename):
            dataIO.writePickle(filename, self.raw_result)
        else:
            print("File %s already exists" % filename)

    def read_raw_result(self, forumName):
        """Load cached posts if a matching pickle exists; return True on hit."""
        filename = 'data/%s_%d_raw_result.dat' % (forumName,
                                                  self.searching_num)
        if os.path.isfile(filename):
            self.raw_result = dataIO.readPickle(filename)
            return True
        else:
            return False
from dcard import Dcard
import json, sys

# Usage: python <script> <forum> <num>
topic = sys.argv[1]
num = int(sys.argv[2])

dcard = Dcard()
ariticle_metas = dcard.forums(topic).get_metas(num=num, sort='new')
articles = dcard.posts(ariticle_metas).get()

# Full structured dump of every article.
with open('result.json', 'w', encoding='utf-8') as f:
    json.dump(articles.result(), f, ensure_ascii=False)

# Plain-text dump: title, content, then one comment body per line.
with open('output.kcm', 'w', encoding='utf-8') as f:
    for i in articles.results:
        # BUG FIX: the first comment used to be glued directly onto the
        # content because the '\n' separator before the join was missing.
        parts = [i['title'], i['content']]
        parts.extend(c.get('content', '') for c in i['comments'])
        f.write('\n'.join(parts) + '\n')
from dcard import Dcard

# Tag a post must carry to be kept.
target = '女友'


def hot(metas):
    """Callback for get_metas: keep only metas tagged with `target`."""
    selected = []
    for meta in metas:
        if target in meta['tags']:
            selected.append(meta)
    return selected


client = Dcard()
photo_forum = client.forums('photography')  # 攝影版 (photography board)
tagged_metas = photo_forum.get_metas(num=50, callback=hot)  # list of metas
posts = client.posts(tagged_metas).get(comments=False, links=False)
resources = posts.parse_resources()  # list: try resources[0][1]
done, fails = posts.download(resources)
report = 'Got %d pics' % done if len(fails) == 0 else 'Error!'
print(report)