def groups(self):
    """Scrape the raw HTML snippets of this user's follow-groups list.

    Pages through the weibo.com "myfollow ... relate=group" widget for
    self.uid, collecting every '<div class="mod_info">...</p>' match per
    page into list_data.

    NOTE(review): list_data is accumulated but never returned or stored,
    and json_list is created but never used — presumably the tail of this
    function (post-processing / return) was lost; confirm against the
    original file.
    """
    # Page-1 URL template; %s is filled with the numeric uid below.
    cr_url = 'http://weibo.com/p/100505%s/myfollow?pids=Pl_Official_RelationGroupList__96&relate=group' \
             '&Pl_Official_RelationGroupList__96_page=1#Pl_Official_RelationGroupList__96'
    json_list = []  # NOTE(review): unused in the visible body
    comment_url = cr_url % self.uid
    list_data = []  # one entry (a list of matched snippets) per fetched page
    while True:  # outer loop: one iteration per result page
        print comment_url
        while True:  # inner loop: retry the fetch until it succeeds
            try:
                request = urllib2.Request(comment_url, headers=self._headers)
                response = urllib2.urlopen(request, timeout=60)
                # Weibo embeds the page body as an escaped string; undo the
                # escaping and restore '/' characters.
                html = response.read().decode('string_escape').replace('\\/', '/')
                break
            except Exception, e:
                #html = ''
                print "Network Exception!!! ", e
                # NOTE(review): retries immediately with no backoff (other
                # methods in this file sleep between retries).
                continue
        #finally:
        # Each match is one group-info block from the listing page.
        datas = getMatchList(html, '<div class="mod_info">.*?</p>')
        # print len(datas)
        # r_datas = datas.reverse()
        list_data.append(datas)
        # Pagination: resolve the "next page" ('下一页') link, if any.
        next_pageUrl = getUrlToPattern(html, comment_url, pattern='page', text_pattern='下一页')
        # print next_pageUrl
        if next_pageUrl:
            comment_url = next_pageUrl[0]
        else:
            break
def commentInbox(self):
    """Scrape the user's received-comments inbox into JSON strings.

    Pages through weibo.com/comment/inbox, extracts one record per
    comment (author, mid, timestamp, text, relationship type, ...) and
    appends each as a json.dumps'd dict to json_list.

    Stops collecting once a comment is at or before self.mlasttime
    (tags flag), i.e. only comments newer than the last crawl are kept.

    NOTE(review): no 'return json_list' is visible at the end of this
    body — the function appears truncated in this view; confirm against
    the original file.
    """
    cr_url = 'http://weibo.com/comment/inbox?&page=1&pids=Pl_Content_Commentlist'
    json_list = []
    tags = False  # set True when we reach comments older than the last crawl
    comment_url = cr_url
    while True:  # one iteration per inbox page
        print comment_url
        while True:  # retry the fetch until it succeeds
            try:
                request = urllib2.Request(comment_url, headers=self._headers)
                response = urllib2.urlopen(request, timeout=60)
                html = response.read().decode('string_escape').replace(
                    '\\/', '/')
                print html
                break
            except Exception, e:
                print "Network Exception!!! ", e
                time.sleep(5)  # back off before retrying
                continue
        #finally:
        # One match per comment block ('主评论' = main comment).
        datas = getMatchList(
            html, '<div class="WB_feed_detail clearfix">(*)<!--/主评论-->')
        # print len(datas)
        for data in datas:
            # Commenter's avatar, uid and display name scraped from markup.
            photo_url = "http:" + getMatch(data, '<img.*?src="(*)"')
            uid = getMatch(data, 'usercard="id=(*)"')
            nickname = getMatch(data, 'page_frame" title="(*)"')
            mid = getMatch(data, '&cid=(*)&')
            # '来自' = "from (client)" — the text before it is the post time.
            timestamp = getMatch(data, '<div class="WB_from S_txt2">(*) 来自')
            if timestamp:
                timestamp = long(getTimeStamp(timestamp))
            else:
                timestamp = 0
            # Already-seen comment: flag and stop processing this page.
            if timestamp <= self.mlasttime:
                tags = True
                break
            text = getMatch(data, '<div class="WB_text">(*)</div>')
            if text:
                text = extractForHTML(text)
            else:
                text = ''
            r_mid = getMatch(data, 'mid=(*)&')  # mid of the commented post
            r_uid = self.uid
            #commet_type = 'make'
            commet_type = 'receive'  # inbox == comments received by self.uid
            # Classify the commenter's relationship to self.uid.
            _type = 'stranger'
            type1 = ''
            type2 = ''
            for fljson in self.follow:
                fjson = json.loads(fljson)
                if fjson['uid'] == uid:
                    type1 = 'follow'
                    break
            for fljson in self.fans:
                fjson = json.loads(fljson)
                if fjson['uid'] == uid:
                    type2 = 'followed'
                    break
            if type1 and type2:
                _type = 'friend'
            elif type1:
                _type = type1
            elif type2:
                _type = type2
            if uid == r_uid:
                _type = 'self'
            wb_item = {
                'photo_url': photo_url,
                'uid': uid,
                'nick_name': nickname,
                'mid': mid,
                'timestamp': timestamp,
                'text': text,
                'root_mid': r_mid,
                'root_uid': r_uid,
                'weibo_type': _type,
                'comment_type': commet_type,
                'update_time': self.update_time
            }
            wb_json = json.dumps(wb_item)
            json_list.append(wb_json)
        # Pagination: resolve the "next page" ('下一页') link, if any.
        next_pageUrl = getUrlToPattern(html, comment_url, pattern='page', text_pattern='下一页')
        # print next_pageUrl
        # NOTE(review): when a next page exists the first branch always
        # wins, so 'tags' never stops pagination early — the crawl only
        # ends on the last page. Likely the intent was to break as soon
        # as tags is True.
        if next_pageUrl:
            comment_url = next_pageUrl[0]
        elif not next_pageUrl or tags:
            break
'nick_name': nickname, 'mid': mid, 'timestamp': timestamp, 'text': text, 'root_mid': r_mid, 'root_uid': r_uid, 'weibo_type': _type, 'update_time': self.update_time } wb_json = json.dumps(wb_item) json_list.append(wb_json) # 分页 next_pageUrl = getUrlToPattern(html, comment_url, pattern='page', text_pattern='下一页') # print next_pageUrl if next_pageUrl: comment_url = next_pageUrl[0] elif not next_pageUrl or tags: break return json_list def execute(self): likes = self.likeInbox() executeES('weibo_feedback_like', 'text', likes) if __name__ == '__main__':
class FeedbackPrivate:
    """Scraper for a Weibo user's private-message inbox.

    NOTE(review): this class continues beyond what is shown here (at
    least a 'follow' method follows); only __init__ and messages are
    documented in this span.
    """

    def __init__(self, uid, current_ts, fans, follow, groups, lastTime):
        # uid        -- numeric Weibo uid of the crawled account
        # current_ts -- crawl timestamp, stored into every emitted record
        # fans       -- iterable of JSON strings, each with a 'uid' key
        # follow     -- iterable of JSON strings, each with a 'uid' key
        # groups     -- follow-group data (stored, unused in visible code)
        # lastTime   -- timestamp of the previous crawl (incremental cutoff)
        self.uid = uid
        self.follow = follow
        self.fans = fans
        self.groups = groups
        self.update_time = current_ts
        self.lasttime = lastTime
        # NOTE(review): the key is literally "Headers" but the value is a
        # User-Agent string — presumably it should be "User-Agent"; confirm.
        self._headers = {
            "Headers": "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2;"
                       " .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0;"
                       " .NET4.0C; .NET4.0E; InfoPath.3)",
            "Referer": "http://weibo.com/u/%s/home?topnav=1&wvr=6" % self.uid
        }

    def messages(self):
        """Scrape the private-message inbox into JSON record strings.

        For each conversation on each inbox page, fetches the message
        detail via the getbyid AJAX endpoint and emits one record per
        message into json_list.

        NOTE(review): no 'return json_list' is visible at the end of
        this body — the function appears truncated in this view.
        """
        # Inbox listing URL, and the per-conversation detail endpoint
        # (%s = peer uid, %d = cache-busting millisecond timestamp).
        cr_url = 'http://weibo.com/messages?pids=Pl_Content_MessageList&page=1'
        de_url = 'http://weibo.com/aj/message/getbyid?ajwvr=6&count=50&uid=%s&_t=0&__rnd=%d'
        json_list = []
        tags = False  # set True once messages older than last crawl are seen
        comment_url = cr_url
        while True:  # one iteration per inbox page
            print comment_url
            while True:  # retry the listing fetch until it succeeds
                try:
                    request = urllib2.Request(comment_url, headers=self._headers)
                    response = urllib2.urlopen(request, timeout=60)
                    html = response.read().decode('string_escape').replace(
                        '\\/', '/')
                    break
                except Exception, e:
                    print "Network Exception!!! ", e
                    continue
            #print 'html:', html
            #finally:
            # One match per conversation entry ('下拉列表' = dropdown list).
            datas = getMatchList(
                html,
                '<div class="private_list SW_fun_bg S_line2 clearfix".*?<!-- 下拉列表 -->'
            )
            for data in datas:
                # Conversation peer: avatar, uid, display name.
                photo_url = "http:" + getMatch(data, '<img.*?src="(*)"')
                uid = getMatch(data, 'usercard="id=(*)"')
                nickname = getMatch(data, '<img.*?alt="(*)"')
                r_uid = self.uid
                # Unread-count badge, defaulting to 0.
                counts = getMatch(
                    data, '<em class="W_new_count S_spetxt_bg">(*)</em>')
                if counts and counts.isdigit():
                    counts = long(counts)
                else:
                    counts = 0
                # Classify the peer's relationship to self.uid.
                _type = 'stranger'
                type1 = ''
                type2 = ''
                for fljson in self.follow:
                    fjson = json.loads(fljson)
                    if fjson['uid'] == uid:
                        type1 = 'follow'
                        break
                for fljson in self.fans:
                    fjson = json.loads(fljson)
                    if fjson['uid'] == uid:
                        type2 = 'followed'
                        break
                if type1 and type2:
                    _type = 'friend'
                elif type1:
                    _type = type1
                elif type2:
                    _type = type2
                if uid == r_uid:
                    _type = 'self'
                while True:  # retry the conversation-detail fetch
                    try:
                        detailUrl = de_url % (uid, int(time.time() * 1000))
                        #print 'detail_url:', detailUrl
                        # NOTE(review): this request omits self._headers,
                        # unlike every other fetch in the file — confirm
                        # whether that is intentional.
                        request = urllib2.Request(detailUrl)
                        response = urllib2.urlopen(request, timeout=60)
                        ms_content = json.loads(response.read())
                        break
                    except Exception, e:
                        print "Network Exception!!! ", e
                        continue
                #else:
                # NOTE(review): this clobbers the listing-page 'html', so
                # the pagination lookup after this loop runs against the
                # last conversation's detail HTML — likely a bug.
                html = ms_content["data"]["html"]
                # '单行文字' = single-line text; '附件信息' = attachment info.
                ms_datas = getMatchList(
                    html, u'(<!-- 单行文字-->|<div class="space">).*?<!--/附件信息-->')
                # print datas[0]
                # Timestamps only appear on some messages; carry the last
                # seen one forward for the untagged messages in between.
                last_time = 0
                for ms_data in ms_datas:
                    mid_uid = getMatch(ms_data, 'usercard="id=(*)"')  # sender
                    mid = getMatch(ms_data, 'mid="(*)"')
                    timestamp = getMatch(
                        ms_data, 'prompt_font S_txt2 S_bg1">(*)</legend>')
                    #soup = BeautifulSoup(ms_data)
                    #timestamp_bs4 = soup.find_all('legend', class_=["prompt_font", "S_txt2", "S_bg1"])
                    if timestamp:
                        timestamp = long(getTimeStamp(timestamp))
                        last_time = timestamp
                    else:
                        timestamp = last_time
                    # Older than the previous crawl: flag it, but note the
                    # record is still emitted (no break/continue here).
                    if timestamp < self.lasttime:
                        tags = True
                        print 'timestamp<lasttime, timestamp, lasttime:', timestamp, self.lasttime
                        #break next
                    text = getMatch(ms_data, u'<div class="cont">.*?<!--/附件信息-->')
                    if text:
                        text = extractForHTML(text)
                        text = commentExtract(text)
                    # Direction of the message relative to self.uid.
                    if mid_uid == uid:
                        private_type = 'receive'
                    elif mid_uid == r_uid:
                        private_type = 'make'
                    else:
                        private_type = ''
                    wb_item = {
                        'photo_url': photo_url,
                        'uid': uid,
                        'nick_name': nickname,
                        'mid': mid,
                        'timestamp': timestamp,
                        'text': text,
                        'root_uid': r_uid,
                        'weibo_type': _type,
                        'private_type': private_type,
                        'w_new_count': counts,
                        'update_time': self.update_time
                    }
                    wb_json = json.dumps(wb_item)
                    #print 'wb_json:::',wb_json
                    json_list.append(wb_json)
            # Pagination: resolve the "next page" ('下一页') link, if any.
            next_pageUrl = getUrlToPattern(html, comment_url, pattern='page', text_pattern='下一页')
            # print next_pageUrl
            # NOTE(review): as in commentInbox, 'tags' never stops
            # pagination while a next page exists — the first branch wins.
            if next_pageUrl:
                comment_url = next_pageUrl[0]
            elif not next_pageUrl or tags:
                break
def follow(self):
    """Scrape the user's followees list into JSON record strings.

    Phase 1 pages through the "myfollow" widget collecting the raw
    '<li class="member_li ...">' snippets per page; phase 2 walks the
    pages and entries in reverse (oldest first) and builds one record
    per followed account.

    NOTE(review): the visible body ends at json_list.append(...) with no
    return — the function appears truncated in this view. Also note an
    instance attribute 'self.follow' is assigned in __init__, which
    shadows this method on instances; confirm which one callers use.
    """
    # Page-1 URL template; %s is filled with the numeric uid below.
    cr_url = 'http://weibo.com/p/100505%s/myfollow?t=1&pids=Pl_Official_RelationMyfollow__93' \
             '&cfs=&Pl_Official_RelationMyfollow__93_page=1#Pl_Official_RelationMyfollow__93'
    json_list = []
    comment_url = cr_url % self.uid
    list_data = []  # one entry (list of <li> snippets) per fetched page
    # --- Phase 1: fetch every page of the followees widget -------------
    while True:
        print "comment_url**comment_url**comment_url**comment_url**", comment_url
        while True:  # retry the fetch until it succeeds
            try:
                request = urllib2.Request(comment_url, headers=self._headers)
                print 1111111111111111  # debug progress markers
                response = urllib2.urlopen(request, timeout=60)
                print 2222222222222222
                html = response.read().decode('string_escape').replace('\\/', '/')
                print 3333333333333333
                break
            except Exception, e:
                print "Network Exception!!! ", e
                continue
        #finally:
        # One match per followed account on this page.
        datas = getMatchList(html, '<li class="member_li S_bg1".*?</li>')
        # print len(datas)
        # r_datas = datas.reverse()
        list_data.append(datas)
        # Pagination: resolve the "next page" ('下一页') link, if any.
        next_pageUrl = getUrlToPattern(html, comment_url, pattern='page', text_pattern='下一页')
        print "next_pageUrl**next_pageUrl**next_pageUrl**next_pageUrl**",next_pageUrl
        if next_pageUrl:
            comment_url = next_pageUrl[0]
        else:
            break
    # --- Phase 2: build records, iterating pages and entries reversed --
    # (last page first, last entry first — i.e. oldest follow first).
    r_list_data = reversed(list_data)
    for l_datas in r_list_data:
        r_datas = reversed(l_datas)
        for data in r_datas:
            #print 'data::',data
            photo_url = getMatch(data, 'profile_image_url=(*)&')
            uid = getMatch(data, 'usercard="id=(*)"')
            nickname = getMatch(data, '<img.*?alt="(*)"')
            # Record creation time is "now"; the 1s sleep below makes
            # successive records carry distinct timestamps.
            timestamp = int(round(time.time()))
            time.sleep(1)
            # Normalize the single-letter sex flag from the profile URL.
            sex = getMatch(data, '&sex=(*)"')
            if not sex:
                sex = ''
            elif sex == 'f':
                sex = 'female'
            elif sex == 'm':
                sex = 'male'
            follow_source = getMatch(data, 'class="S_link2" >(*)</a>')
            if not follow_source:
                follow_source = ''
            description = getMatch(data, 'W_autocut S_txt2">(*)</div>')
            if not description:
                description = ''
            # Follow-group id/name; '0' / '' mean "no group".
            gid = getMatch(data, '&gid=(*)&')
            if not gid:
                gid = '0'
            gname = getMatch(data, '&gname=(*)&')
            if not gname:
                gname = ''
            r_uid = self.uid
            _type = 'follow'
            # Fetch the followee's detailed profile (disabled).
            #user = SinaOperateAPI().getUserShow(uid=uid)
            # NOTE: 'mid' is deliberately set to the followee's uid here.
            wb_item = {
                'photo_url': photo_url,
                'uid': uid,
                'mid': uid,
                'nick_name': nickname,
                'timestamp': timestamp,
                'sex': sex,
                'description': description,
                'follow_source': follow_source,
                'gid': gid,
                'gname': gname,
                'root_uid': r_uid,
                'weibo_type': _type,
                'update_time': self.update_time
            }
            # getMatch can yield None; keep the record JSON-clean.
            if wb_item['mid'] == None:
                wb_item['mid'] = ''
            print "follow, mid", wb_item['mid']
            print "follow, root_uid", wb_item['root_uid']
            wb_json = json.dumps(wb_item)
            # print wb_json
            json_list.append(wb_json)