def commentInbox(self):
    """Crawl the "comments received" inbox and return the new comments.

    Pages of ``http://weibo.com/comment/inbox`` are fetched one after
    another.  Every comment block on a page is turned into a dict
    (photo_url, uid, nick_name, mid, timestamp, text, root_mid, root_uid,
    weibo_type, comment_type, update_time) and serialised with
    ``json.dumps``.

    Crawling stops as soon as a comment whose timestamp is not newer than
    ``self.mlasttime`` is met, or when the page has no "next page" link.

    Reads ``self._headers``, ``self.mlasttime``, ``self.uid``,
    ``self.follow``, ``self.fans`` and ``self.update_time``.

    Returns:
        list: JSON strings, one per comment, in the order they were parsed.
    """
    cr_url = 'http://weibo.com/comment/inbox?&page=1&pids=Pl_Content_Commentlist'
    json_list = []
    reached_old = False
    comment_url = cr_url
    r_uid = self.uid
    commet_type = 'receive'

    # self.follow / self.fans hold JSON strings; decode them once instead of
    # once per comment.
    follow_uids = set(json.loads(item)['uid'] for item in self.follow)
    fans_uids = set(json.loads(item)['uid'] for item in self.fans)

    while True:
        print(comment_url)

        # Retry until the page has been downloaded and unescaped.
        while True:
            try:
                request = urllib2.Request(comment_url, headers=self._headers)
                response = urllib2.urlopen(request, timeout=60)
                try:
                    raw = response.read()
                finally:
                    response.close()
                html = raw.decode('string_escape').replace('\\/', '/')
                break
            except Exception as e:
                print("Network Exception!!!  %s" % e)
                time.sleep(5)

        datas = getMatchList(
            html, '<div class="WB_feed_detail clearfix">(*)<!--/主评论-->')
        for data in datas:
            photo_url = "http:" + getMatch(data, '<img.*?src="(*)"')
            uid = getMatch(data, 'usercard="id=(*)"')
            nickname = getMatch(data, 'page_frame" title="(*)"')
            mid = getMatch(data, '&cid=(*)&')

            timestamp = getMatch(data, '<div class="WB_from S_txt2">(*) 来自')
            timestamp = long(getTimeStamp(timestamp)) if timestamp else 0
            # NOTE(review): other crawlers in this file compare against
            # self.lasttime; confirm that mlasttime is the intended attribute.
            if timestamp <= self.mlasttime:
                reached_old = True
                break

            text = getMatch(data, '<div class="WB_text">(*)</div>')
            text = extractForHTML(text) if text else ''
            r_mid = getMatch(data, 'mid=(*)&')

            # Relationship between the commenter and the crawled account.
            if uid == r_uid:
                _type = 'self'
            elif uid in follow_uids and uid in fans_uids:
                _type = 'friend'
            elif uid in follow_uids:
                _type = 'follow'
            elif uid in fans_uids:
                _type = 'followed'
            else:
                _type = 'stranger'

            wb_item = {
                'photo_url': photo_url,
                'uid': uid,
                'nick_name': nickname,
                'mid': mid,
                'timestamp': timestamp,
                'text': text,
                'root_mid': r_mid,
                'root_uid': r_uid,
                'weibo_type': _type,
                'comment_type': commet_type,
                'update_time': self.update_time
            }
            json_list.append(json.dumps(wb_item))

        # Pagination.  The stop flag must win over an available next page,
        # otherwise the whole history is crawled again on every run.
        if reached_old:
            break
        next_pageUrl = getUrlToPattern(html, comment_url, pattern='page',
                                       text_pattern='下一页')
        if not next_pageUrl:
            break
        comment_url = next_pageUrl[0]

    return json_list
def atMeMicroBlog(self):
    """Crawl the "@ me" microblog feed and return the new posts.

    The feed is requested from ``http://weibo.com/aj/at/mblog/list`` with
    three paging parameters (``pre_page``, ``page``, ``pagebar``) that are
    advanced in the sequence (0,1,0) -> (1,1,0) -> (1,1,1) -> (1,2,0) ...
    The ``data`` field of each JSON response is parsed as HTML; every post
    becomes a dict (photo_url, uid, nick_name, mid, timestamp, text,
    retweet, comment, like, root_mid, root_uid, weibo_type, update_time)
    serialised with ``json.dumps``.

    Crawling stops when a response carries no content or when a post whose
    timestamp is not newer than ``self.lasttime`` is met.

    Reads ``self._headers``, ``self.lasttime``, ``self.uid``,
    ``self.follow``, ``self.fans`` and ``self.update_time``.

    Returns:
        list: JSON strings, one per post, in the order they were parsed.
    """
    pre_page = 0
    page = 1
    pagebar = 0
    at_MBurl = ('http://weibo.com/aj/at/mblog/list?ajwvr=6&pre_page=%s&page=%s'
                '&filter_by_author=0&filter_by_type=0&is_adv=0&pagebar=%s')
    print(at_MBurl)
    json_list = []
    r_uid = self.uid

    # self.follow / self.fans hold JSON strings; decode them once instead of
    # once per post.
    follow_uids = set(json.loads(item)['uid'] for item in self.follow)
    fans_uids = set(json.loads(item)['uid'] for item in self.fans)

    def to_count(raw):
        # Counters are shown as digits; anything else (empty, label only)
        # counts as zero.
        return long(raw) if raw and raw.isdigit() else 0

    while True:
        wbUrl = at_MBurl % (pre_page, page, pagebar)
        print("current url:  %s" % wbUrl)

        # Retry until a JSON payload has been downloaded and decoded.
        while True:
            try:
                request = urllib2.Request(wbUrl, headers=self._headers)
                response = urllib2.urlopen(request, timeout=60)
                try:
                    mb_content = json.loads(response.read())
                finally:
                    response.close()
                break
            except Exception as e:
                print("Network Exception!!!  %s" % e)
                # Back off instead of hammering the server in a tight loop.
                time.sleep(5)

        html = mb_content["data"]
        # An all-whitespace payload means there is nothing left to load.
        if html.replace('\n', '').replace(' ', '') == '':
            break

        reached_old = False
        datas = getMatchList(
            html,
            '<div class="WB_face W_fl">(*)<div node-type="feed_list_repeat')
        for data in datas:
            photo_url = "http:" + getMatch(data, '<img.*?src="(*)"')
            uid = getMatch(data, 'usercard="id=(*)&')
            nickname = getMatch(data, 'nick-name="(*)"')
            mid = getMatch(data, 'pubuser_nick:(*)"')

            # The last three characters of the date attribute are dropped
            # before the value is converted to an integer.
            timestamp = getMatch(
                data, '<div class="WB_from S_txt2">.*?date="(*)"')[0:-3]
            timestamp = to_count(timestamp)
            if timestamp <= self.lasttime:
                reached_old = True
                break

            text = getMatch(data, 'feed_list_content" >(*)</div>').strip()
            text = extractForHTML(text) if text else ''

            retweet = to_count(getMatch(
                data, 'forward_btn_text">.*?<em>(*)</em>').replace('转发', ''))
            comment = to_count(getMatch(
                data, 'comment_btn_text">.*?<em>(*)</em>').replace('评论', ''))
            like = to_count(getMatch(data, 'UI_ani_praised".*?<em>(*)</em>'))
            r_mid = getMatch(data, 'rootmid=(*)&')

            # Relationship between the author and the crawled account.
            if uid == r_uid:
                _type = 'self'
            elif uid in follow_uids and uid in fans_uids:
                _type = 'friend'
            elif uid in follow_uids:
                _type = 'follow'
            elif uid in fans_uids:
                _type = 'followed'
            else:
                _type = 'stranger'

            wb_item = {
                'photo_url': photo_url,
                'uid': uid,
                'nick_name': nickname,
                'mid': mid,
                'timestamp': timestamp,
                'text': text,
                'retweet': retweet,
                'comment': comment,
                'like': like,
                'root_mid': r_mid,
                'root_uid': r_uid,
                'weibo_type': _type,
                'update_time': self.update_time
            }
            json_list.append(json.dumps(wb_item))

        # Stop right away once an already-seen post was met; there is no
        # point in issuing another request whose result would be discarded.
        if reached_old:
            break

        # Pagination: advance to the next (pre_page, page, pagebar) triple.
        if pre_page < page:
            pre_page += 1
        elif pre_page == page and pagebar == 0:
            pagebar = 1
        elif pagebar == 1:
            pre_page = page
            page += 1
            pagebar = 0

    return json_list
def messages(self):
    """Crawl the private-message list and return the new messages.

    The conversation list is fetched from ``http://weibo.com/messages``.
    For every conversation the detail endpoint
    ``/aj/message/getbyid`` is queried, and each message in it
    whose timestamp is not older than ``self.lasttime`` becomes a dict
    (photo_url, uid, nick_name, mid, timestamp, text, root_uid, weibo_type,
    private_type, w_new_count, update_time) serialised with ``json.dumps``.

    A message without its own time legend inherits the timestamp of the
    closest preceding message that had one (0 if there was none).

    Reads ``self._headers``, ``self.lasttime``, ``self.uid``,
    ``self.follow``, ``self.fans`` and ``self.update_time``.

    Returns:
        list: JSON strings, one per message, in the order they were parsed.
    """
    cr_url = 'http://weibo.com/messages?pids=Pl_Content_MessageList&page=1'
    de_url = 'http://weibo.com/aj/message/getbyid?ajwvr=6&count=50&uid=%s&_t=0&__rnd=%d'
    json_list = []
    comment_url = cr_url
    r_uid = self.uid

    # self.follow / self.fans hold JSON strings; decode them once instead of
    # once per conversation.
    follow_uids = set(json.loads(item)['uid'] for item in self.follow)
    fans_uids = set(json.loads(item)['uid'] for item in self.fans)

    while True:
        print(comment_url)

        # Retry until the conversation list has been downloaded.
        while True:
            try:
                request = urllib2.Request(comment_url, headers=self._headers)
                response = urllib2.urlopen(request, timeout=60)
                try:
                    raw = response.read()
                finally:
                    response.close()
                list_html = raw.decode('string_escape').replace('\\/', '/')
                break
            except Exception as e:
                print("Network Exception!!!  %s" % e)
                # Back off instead of hammering the server in a tight loop.
                time.sleep(5)

        datas = getMatchList(
            list_html,
            '<div class="private_list SW_fun_bg S_line2 clearfix".*?<!-- 下拉列表 -->'
        )
        for data in datas:
            photo_url = "http:" + getMatch(data, '<img.*?src="(*)"')
            uid = getMatch(data, 'usercard="id=(*)"')
            nickname = getMatch(data, '<img.*?alt="(*)"')

            counts = getMatch(
                data, '<em class="W_new_count S_spetxt_bg">(*)</em>')
            counts = long(counts) if counts and counts.isdigit() else 0

            # Relationship between the peer and the crawled account.
            if uid == r_uid:
                _type = 'self'
            elif uid in follow_uids and uid in fans_uids:
                _type = 'friend'
            elif uid in follow_uids:
                _type = 'follow'
            elif uid in fans_uids:
                _type = 'followed'
            else:
                _type = 'stranger'

            # Retry until the conversation detail has been downloaded.
            while True:
                try:
                    detailUrl = de_url % (uid, int(time.time() * 1000))
                    # Send the same headers as every other request of this
                    # crawler.
                    request = urllib2.Request(detailUrl,
                                              headers=self._headers)
                    response = urllib2.urlopen(request, timeout=60)
                    try:
                        ms_content = json.loads(response.read())
                    finally:
                        response.close()
                    break
                except Exception as e:
                    print("Network Exception!!!  %s" % e)
                    time.sleep(5)

            detail_html = ms_content["data"]["html"]
            ms_datas = getMatchList(
                detail_html,
                u'(<!-- 单行文字-->|<div class="space">).*?<!--/附件信息-->')

            last_time = 0
            for ms_data in ms_datas:
                mid_uid = getMatch(ms_data, 'usercard="id=(*)"')
                mid = getMatch(ms_data, 'mid="(*)"')
                timestamp = getMatch(
                    ms_data, 'prompt_font S_txt2 S_bg1">(*)</legend>')
                if timestamp:
                    timestamp = long(getTimeStamp(timestamp))
                    last_time = timestamp
                else:
                    timestamp = last_time

                if timestamp < self.lasttime:
                    print('timestamp<lasttime, timestamp, lasttime: %s %s'
                          % (timestamp, self.lasttime))
                    # Skip this message only; later ones in the same
                    # conversation may still be new.
                    continue

                text = getMatch(ms_data,
                                u'<div class="cont">.*?<!--/附件信息-->')
                if text:
                    text = extractForHTML(text)
                    text = commentExtract(text)

                # Direction of the message relative to the crawled account.
                if mid_uid == uid:
                    private_type = 'receive'
                elif mid_uid == r_uid:
                    private_type = 'make'
                else:
                    private_type = ''

                wb_item = {
                    'photo_url': photo_url,
                    'uid': uid,
                    'nick_name': nickname,
                    'mid': mid,
                    'timestamp': timestamp,
                    'text': text,
                    'root_uid': r_uid,
                    'weibo_type': _type,
                    'private_type': private_type,
                    'w_new_count': counts,
                    'update_time': self.update_time
                }
                json_list.append(json.dumps(wb_item))

        # Pagination, done the same way as in commentInbox.
        # NOTE(review): assumes the message list exposes the same "下一页"
        # next-page link as the comment inbox - TODO confirm.  When no link
        # is found the crawl simply ends after the current page.
        next_pageUrl = getUrlToPattern(list_html, comment_url,
                                       pattern='page', text_pattern='下一页')
        if not next_pageUrl or next_pageUrl[0] == comment_url:
            break
        comment_url = next_pageUrl[0]

    return json_list
mid = getMatch(data, '&cid=(*)&') timestamp = getMatch( data, '<div class="WB_from S_txt2">(*) 来自') if timestamp: timestamp = long(getTimeStamp(timestamp)) else: timestamp = 0 if timestamp <= self.lasttime: tags = True break text = getMatch(data, '<div class="WB_text S_txt2">(*)</div>') if text: text = extractForHTML(text) else: text = '' r_mid = getMatch(data, 'mid=(*)&') r_uid = self.uid _type = 'stranger' type1 = '' type2 = '' for fljson in self.follow: fjson = json.loads(fljson) if fjson['uid'] == uid: type1 = 'follow' break for fljson in self.fans: fjson = json.loads(fljson)
uid = getMatch(data, 'usercard="id=(*)&') nickname = getMatch(data, 'nick-name="(*)"') mid = getMatch(data, 'pubuser_nick:(*)"') timestamp = getMatch(data, '<div class="WB_from S_txt2">.*?date="(*)"')[0:-3] if timestamp and timestamp.isdigit(): timestamp = long(timestamp) else: timestamp = 0 if timestamp <= self.lasttime: tags = True break text = getMatch(data, 'feed_list_content" >(*)</div>').strip() if text: text = extractForHTML(text.strip()) else: text = '' retweet = getMatch(data, 'forward_btn_text">.*?<em>(*)</em>').replace('转发', '') if retweet and retweet.isdigit(): retweet = long(retweet) else: retweet = 0 comment = getMatch(data, 'comment_btn_text">.*?<em>(*)</em>').replace('评论', '') if comment and comment.isdigit(): comment = long(comment) else: comment = 0