# -*- coding: utf-8 -*-
# NOTE: this spider is Python 2 code (str/unicode handling and the behaviour of
# filter() below assume Python 2). Imports reconstructed for this snippet; the
# project-local import paths (weibo_login, items) are assumptions.
import os
import re
import json
import datetime
import logging

import requests
from bs4 import BeautifulSoup
from scrapy import Spider, Request

from weibo_login import WeiboLogin
from items import WeiboItem, WeiboSocialConnection, WeiboUserInfoItem

logger = logging.getLogger(__name__)


class UserInfoCrawl(Spider):
    name = "weibo_user_info"
    # allowed_domains = ["weibo.cn"]

    def __init__(self, name="*****@*****.**", password="******",
                 uid="09424248189", *args, **kwargs):
        super(UserInfoCrawl, self).__init__(*args, **kwargs)
        self.uid = uid
        self.start_urls = ["http://weibo.com"]
        self.allowed_domains = ["weibo.com", "weibo.cn"]
        self.url_base = "http://weibo.cn"
        self.first_flag_info = True  # do not crawl the logged-in account's own weibo
        self.first_flag_home = True  # the account's own profile is handled differently from other accounts
        if os.path.exists("weibocookie.json"):
            with open("weibocookie.json", "r") as f:
                self.cookie = json.load(f)
        else:
            self.weibo = WeiboLogin()
            self.session = self.weibo.login(name, password)
            cookiejar = requests.utils.dict_from_cookiejar(self.session.cookies)
            # Set the Sina Weibo cookie
            self.cookie = {
                'ALF': cookiejar['ALF'],
                'sso_info': cookiejar['sso_info'],
                'SUB': cookiejar['SUB'],
                'SUBP': cookiejar['SUBP'],
                'SUE': cookiejar['SUE'],
                'SUHB': cookiejar['SUHB'],
                'SUP': cookiejar['SUP'],
                'SUS': cookiejar['SUS']
            }
            with open("weibocookie.json", "w") as f:
                json.dump(self.cookie, f)

    def start_requests(self):
        # Parse the weibo homepage
        home_url = "http://weibo.cn/u/%s" % self.uid
        yield Request(url=home_url, cookies=self.cookie,
                      callback=self._parse_homepage, errback=self.parse_error)

    def _parse_homepage(self, response):
        html = response.body
        soup = BeautifulSoup(html, "lxml")
        # follower count (get_fans_count and the two helpers below are defined
        # elsewhere in this spider)
        fans_count, uid = self.get_fans_count(soup)
        # number of weibo posts
        weibo_count = self.get_weibo_count(soup)
        # followees
        follow_count, follow_url = self.get_follows(soup)
        # weibo posts; only the first one is crawled
        weibo_item = self.parse_weibo_context(soup, uid)
        if weibo_item is not None:
            yield weibo_item
        weibo_social = WeiboSocialConnection()
        weibo_social["user_id"] = uid
        weibo_social["weibo"] = weibo_count
        weibo_social["fans"] = fans_count
        weibo_social["follow"] = follow_count
        if weibo_count > 10:
            yield weibo_social
        # profile details
        detail_url_ele = soup.find("a", text=u"资料")
        if detail_url_ele:
            detail_url = self.url_base + detail_url_ele["href"]
            yield Request(url=detail_url, cookies=self.cookie,
                          callback=self.parse_info, errback=self.parse_error,
                          priority=1)
        if follow_url:
            yield Request(url=follow_url, cookies=self.cookie,
                          callback=self.parse_follow, errback=self.parse_error)

    def parse_error(self, failure):
        # Scrapy errbacks receive a Failure, not a Response; the original read
        # response.url, which would itself raise here.
        logger.error("request failed: %s" % failure.request.url)

    def parse_info(self, response):
        html = response.body
        soup = BeautifulSoup(html, "lxml")
        info_tip_ele = soup.find("div", text=u"基本信息")
        uid = self.get_uid_from_response(response)
        info = {}
        if info_tip_ele:
            info_ele = info_tip_ele.next_sibling
            if self.first_flag_info:
                self.first_flag_info = False
                # info_eles = info_ele.find_all("a")
                # for ele in info_eles:
                #     if ele.text in [u"昵称", u"性别", u"地区", u"生日", u"简介"]:
                #         info[ele.text.encode("utf-8")] = ele.next_sibling.encode("utf-8")
                #         print ele.text, ele.next_sibling
            else:
                info_eles = info_ele.strings
                user_info = WeiboUserInfoItem()
                user_info["user_id"] = uid
                for ele in info_eles:
                    el = ele.split(":")
                    if len(el) == 2 and el[0] in [u"昵称", u"性别", u"地区",
                                                  u"生日", u"简介"]:
                        info[el[0]] = el[1]
                        info_item = el[1].encode("utf-8")
                        if el[0] == u"昵称":    # nickname
                            user_info["user_name"] = info_item
                        elif el[0] == u"性别":  # gender
                            user_info["sex"] = info_item
                        elif el[0] == u"地区":  # region: "province city" or just "city"
                            region = info_item.split(" ")
                            if len(region) == 1:
                                user_info["province"] = ""
                                user_info["city"] = region[0]
                            else:
                                user_info["province"] = region[0]
                                user_info["city"] = region[1]
                        elif el[0] == u"生日":  # birthday; pad a missing year with 2050
                            if len(info_item.split("-")) < 3:
                                user_info["birthday"] = "2050-" + info_item
                            else:
                                user_info["birthday"] = info_item
                            p = re.compile(r"^\d{4}-\d{2}-\d{2}$")
                            if not p.findall(user_info["birthday"]):
                                user_info["birthday"] = None
                        elif el[0] == u"简介":  # bio; strip whitespace and a few emoji byte sequences
                            # info_item is already UTF-8 encoded; re-encoding it
                            # (as the original did) raises on non-ASCII bytes in Python 2
                            user_info["abstract"] = info_item.replace(" ", "") \
                                .replace("\n", "").replace("\xc2\xa0", "") \
                                .replace("\xF0\x9F\x91\x8A", "") \
                                .replace("\xF0\x9F\x91\xBC", "") \
                                .replace("\xF0\x9F\x8C\xB8\xF0\x9F", "")
                yield user_info

    def parse_follow(self, response):
        html = response.body
        soup = BeautifulSoup(html, "lxml")
        table_eles = soup.find_all("table")
        for ele in table_eles:
            follower_url = ele.find("a")["href"]
            yield Request(url=follower_url, cookies=self.cookie,
                          callback=self._parse_homepage, errback=self.parse_error)

    def get_uid_from_response(self, response):
        url = response if isinstance(response, str) else response.url
        pattern = re.compile(r'/(\d+)/?')
        res = re.findall(pattern, url)
        return int(res[0]) if res else 0

    def parse_weibo_context(self, soup, uid):
        weibo_info = WeiboItem()
        if self.first_flag_home:
            self.first_flag_home = False
            return None
        contexts = soup.find_all("div", class_="c")
        for item in contexts:
            try:
                context = item.find("span", class_="ctt")
                if not context:
                    continue
                # strip whitespace, non-breaking spaces and a few emoji byte sequences
                weibo_text = context.text.encode("utf-8", "ignore").replace(" ", "") \
                    .replace("\n", "").replace("\xc2\xa0", "") \
                    .replace("\xF0\x9F\x91\x8A", "").replace("\xF0\x9F\x91\xBC", "") \
                    .replace("\xF0\x9F\x8C\xB8\xF0\x9F", "")
                parent_ele = context.parent.parent
                like_ele = parent_ele.find(text=re.compile(u"^赞\[\d*\]$"))       # likes
                relay_ele = parent_ele.find(text=re.compile(u"^转发\[\d*\]$"))    # reposts
                comment_ele = parent_ele.find(text=re.compile(u"^评论\[\d*\]$"))  # comments
                issue_time_ele = parent_ele.find("span", class_="ct")
                issue_time = issue_time_ele.text.encode("utf-8")
                # the timestamp reads "<time> 来自 <device>"; normalize relative
                # times ("N minutes ago", "today HH:MM") to absolute ones
                issue = issue_time.split("来自")
                issue_datetime = ""
                if len(issue) > 0:
                    if "分钟" in issue[0]:  # "N minutes ago"
                        minutes = filter(str.isdigit, issue[0])
                        t = datetime.datetime.now() - datetime.timedelta(minutes=int(minutes))
                        issue_datetime = t.strftime("%Y-%m-%d %H:%M:%S")
                    elif "今天" in issue[0]:  # "today HH:MM"
                        clock = issue[0].replace("今天 ", "").replace("\xc2\xa0", "")
                        issue_datetime = datetime.datetime.now().strftime("%Y-%m-%d ") + clock
                    else:  # "MM月DD日" style or a full date
                        issue_datetime = issue[0].replace("月", "-").replace("日", "") \
                            .replace("\xc2\xa0", "")
                        if issue[0].count("-") < 2:
                            issue_datetime = datetime.datetime.now().strftime("%Y-") + issue_datetime
                issue_device = issue[1] if len(issue) > 1 else None
                weibo_info["context"] = weibo_text
                weibo_info["user_id"] = uid
                weibo_info["issue_time"] = issue_datetime.strip()
                weibo_info["get_time"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                weibo_info["like_count"] = filter(str.isdigit, like_ele.encode("utf-8"))
                weibo_info["relay_count"] = filter(str.isdigit, relay_ele.encode("utf-8"))
                weibo_info["comment_count"] = filter(str.isdigit, comment_ele.encode("utf-8"))
                weibo_info["device"] = issue_device
                return weibo_info  # only the first weibo is crawled
            except Exception as e:  # "except Exception, e" is Python 2-only syntax
                logger.error(e)
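# A minimal sketch (not part of the original file) of driving this spider from
# a plain script instead of "scrapy crawl weibo_user_info". It assumes a
# Scrapy >= 1.0 install; the uid below is purely illustrative.
from scrapy.crawler import CrawlerProcess

if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(UserInfoCrawl, uid="1234567890")  # kwargs reach __init__
    process.start()  # blocks until the crawl finishes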
# coding: utf-8
import urllib2

import post_encode
import get_weibo
from weibo_login import WeiboLogin

if __name__ == '__main__':
    login = WeiboLogin('17089368196', 'tttt5555')
    if login.login():
        print "login succeeded"
        # loop over `page` here to crawl more than one result page
        # (the keyword "周扬青" is percent-encoded twice in this URL)
        html = urllib2.urlopen("http://s.weibo.com/weibo/%25E5%2591%25A8%25E6%2589%25AC%25E9%259D%2592&page=3").read()
        # call the function that parses the html content
        get_weibo.write_all_info(html)
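# A minimal sketch (not part of the original script) of building that search
# URL instead of hard-coding it: s.weibo.com expects the keyword
# percent-encoded twice, so the UTF-8 keyword is quoted two times.
# `build_search_url` is a hypothetical helper name.
import urllib

def build_search_url(keyword, page=1):
    quoted = urllib.quote(urllib.quote(keyword))  # double percent-encoding
    return "http://s.weibo.com/weibo/%s&page=%d" % (quoted, page)

# build_search_url("周扬青", page=3) reproduces the hard-coded URL above.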
# Imports reconstructed for this snippet (Python 3). WeiboLogin,
# WeiboCommentsCrawler, verify_user and the constants USER_NAME, PASSWD and
# search_domain are project-local, so their exact import paths below are
# assumptions; json_util is assumed to come from pymongo's bson package.
import os
import re
import sys
import json
import time
import codecs
import random
import logging
import datetime
import urllib.parse

from bs4 import BeautifulSoup
from bson import json_util
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException

from weibo_login import WeiboLogin
from weibo_comments_crawler import WeiboCommentsCrawler
from verification import verify_user
from config import USER_NAME, PASSWD, search_domain


class WeiboCrawler():
    '''
    crawl weibo using keywords
    '''

    def __init__(self, search_key, user_name=USER_NAME, passwd=PASSWD):
        # log in to Sina Weibo
        self.driver = webdriver.PhantomJS()
        # the interface for authorization
        self.wl = WeiboLogin(user_name, passwd, self.driver)
        if self.wl.login():
            logging.info('login succeeded')
        else:
            logging.error('login failed')
            sys.exit(1)
        self.sk = search_key.strip()

    def __del__(self):
        self.driver.quit()

    def crawl(self, page_count=1, comments=False):
        '''
        crawl the weibo using the keywords
        page_count: how many pages will be crawled
        '''
        self.results = []
        # visit the result pages in random order to look less robotic
        pages = list(range(1, page_count + 1))
        random.shuffle(pages)
        for t in ('hot', 'time'):
            for i in pages:
                # the original dropped t and always searched 'hot'; pass it on
                url_to_crawl = self.get_search_url(i, t)
                logging.info('crawling page {}:{}'.format(i, url_to_crawl))
                self.driver.get(url_to_crawl)
                # wait for the page to load its content
                try:
                    WebDriverWait(self.driver, 5).until(
                        lambda x: x.find_elements_by_class_name('feed_list'))
                except TimeoutException:
                    logging.info('there is no weibo content in {}'.format(url_to_crawl))
                    logging.info('you are considered a robot')
                    logging.info(self.driver.current_url)
                    self.driver.get_screenshot_as_file('./screenshot/error.png')
                    # let the user input the verification code
                    verify_user(self.driver, 'search')
                    # break
                # mid is used to crawl the original weibo content in batch mode
                weibo_list = self.get_weibo_list(self.driver.page_source)
                self.results.extend(weibo_list)
                # sleep some time to avoid hitting the site too hard
                # time.sleep(1)
            else:
                continue
            break
        logging.info('total result {}'.format(len(self.results)))
        if comments:
            logging.info('crawling the comments')
            self.crawl_comments()

    def get_search_url(self, page=1, w_type='hot'):
        '''
        compose a search url from the page number and the weibo sort type
        '''
        url = 'http://' + search_domain + '/wb'
        url += urllib.parse.quote('/' + self.sk)
        url += '&'
        url += urllib.parse.urlencode([('page', page), ('xsort', w_type)])
        return url

    def get_weibo_list(self, content):
        '''
        parse the weibo content in the current result page
        content: the source page of the keywords result
        return: a list of weibo objects
        '''
        weibo_list = []
        soup = BeautifulSoup(content, 'html5lib')
        for t in soup.find_all('dl', class_='feed_list'):
            if t.has_attr('mid'):
                weibo = self.parse_weibo(t)
                if weibo:
                    weibo_list.append(weibo)
        logging.info('There are {} weibo on this page'.format(len(weibo_list)))
        return weibo_list

    def parse_weibo(self, t):
        '''
        parse a weibo object from html
        t: the tag object that holds the weibo content
        return: a weibo dict, or None on a parse error
        '''
        weibo = {}
        try:
            weibo['keywords'] = self.sk.split(' ')  # keywords is a list of words
            weibo['mid'] = t['mid']
            # the user name and profile link
            weibo['screen_name'] = t.find(name='dt', class_='face').find('a').get('title')
            weibo['user_profile'] = t.find(name='dt', class_='face').find('a').get('href')
            # the content of the weibo
            weibo['text'] = t.find(name='dd', class_='content').find('em').get_text().strip()
            # the source url of the weibo
            weibo['source_url'] = t.find(name='a', class_='date').get('href').strip()
            logging.info(weibo['source_url'])
            # meta data: the `date` attribute holds a millisecond timestamp, so
            # keep only as many leading digits as an epoch-seconds value has
            epoch_length = len(str(int(time.time())))
            time_str = t.find('dd', class_='content').find(
                'p', class_='info W_linkb W_textb').find(
                    name='a', class_='date').get('date')[0:epoch_length]
            time_now = time.localtime(int(time_str))
            weibo['created_at'] = datetime.datetime(*time_now[0:6])
            weibo['source'] = t.find('dd', class_='content').find(
                'p', class_='info W_linkb W_textb').find(
                    'a', rel='nofollow').string.strip()
            pop_str = t.find('dd', class_='content').find(
                'p', class_='info W_linkb W_textb').find(
                    'span').get_text().strip().replace('\n', '')
            pop_type = {
                # key: source representation, value: attr
                '赞': 'like_count',
                '转发': 'repost_count',
                '评论': 'comment_count'
            }
            for key in pop_type:
                pattern = re.compile(r'.*(%s\((\d+)\)).*' % key)
                match = pattern.match(pop_str)
                if match:
                    weibo[pop_type[key]] = int(match.group(2))
                else:
                    weibo[pop_type[key]] = 0
        except Exception as e:
            logging.info(e)
            return None
        return weibo

    def save(self, dist_dir='result'):
        '''
        save the search results to file
        '''
        if dist_dir not in os.listdir(os.curdir):
            os.mkdir(dist_dir)
        for w in self.results:
            file_name = '_'.join(w['keywords']) + w['mid'] + '.txt'
            # close the file handle properly (the original leaked it)
            with codecs.open(os.path.join(dist_dir, file_name), 'w', 'utf-8') as f:
                json.dump(w, f, ensure_ascii=False,
                          default=json_util.default, indent=2)
            logging.info('wrote to file {}'.format(file_name))

    def crawl_comments(self):
        '''
        crawl the comments after getting all the results and
        update the results list --> self
        '''
        client = self.wl.authorize_app()
        if client:
            for w in self.results:
                w['comments'] = []
                crawler = WeiboCommentsCrawler(client, weibo_mid=w['mid'])
                r = crawler.crawl()
                # filter out the unrelated fields
                for c in r:
                    c.pop('status')
                w['comments'].extend(r)
        else:
            logging.error('authorization failed, cannot fetch the comment list')
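# A minimal usage sketch for WeiboCrawler (not part of the original file); the
# keyword and page count are illustrative, and USER_NAME/PASSWD are expected to
# come from the config module above.
if __name__ == '__main__':
    wc = WeiboCrawler('data mining')         # space-separated search keyword(s)
    wc.crawl(page_count=2, comments=False)   # fetch two result pages per sort order
    wc.save()                                # one JSON document per weibo under ./result/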