def get_follows(uid, n_follows, followee_or_follower):
    '''
    Get a weibo-user's followees or followers.
    `uid`: uid of this user
    `n_follows`: number of follows to get
    `followee_or_follower`: 1 - followee, 2 - follower
    return: a list of Follow
    '''
    if not os.path.exists(COOKIE_FILE):
        time.sleep(random.uniform(5, 25))
    if login(USERNAME, PASSWORD, COOKIE_FILE):
        follows = []
        # weibo only exposes the first 200 follows; check None before comparing
        n_follows = 200 if n_follows is None or n_follows > 200 else n_follows
        # 20 follows per page
        n_pages = n_follows // 20 if n_follows % 20 == 0 else n_follows // 20 + 1
        for page in range(n_pages):
            for o_o in range(MAXTIMES2TRY):
                try:
                    if followee_or_follower == 1:
                        url = 'http://weibo.com/' + uid + '/follow?page=' + str(page + 1)
                    else:
                        url = 'http://weibo.com/' + uid + '/follow?relate=fans&page=' + str(page + 1)
                    html = urlfetch(url)
                except URLError:
                    log.error("URLError! - url: %s" % url)
                    time.sleep(randint(1, MAXSLEEPINGTIME))
                    continue
                else:
                    try:
                        follows_on_current_page = parse_follow(html)
                    except UnsuspectedPageStructError:
                        # the cookie has probably expired: drop it and re-login
                        log.error("Unsuspected page structure! - url: %s" % url)
                        try:
                            os.remove(COOKIE_FILE)
                        except OSError:
                            pass
                        if not os.path.exists(COOKIE_FILE):
                            time.sleep(random.uniform(5, 25))
                            if not login(USERNAME, PASSWORD, COOKIE_FILE):
                                log.error("Login fail!")
                    else:
                        follows += follows_on_current_page
                        if followee_or_follower == 1:
                            log.info("Followees fetched. - uid: %s - count: %d - page: %d" % (uid, len(follows_on_current_page), page + 1))
                        else:
                            log.info("Followers fetched. - uid: %s - count: %d - page: %d" % (uid, len(follows_on_current_page), page + 1))
                        break
            time.sleep(randint(1, MAXSLEEPINGTIME))
        return follows
    else:
        log.error("Login fail!")
        try:
            os.remove(COOKIE_FILE)
        except OSError:
            pass
        return []
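# A minimal usage sketch for get_follows (not part of the original module).
# It assumes conf's USERNAME/PASSWORD/COOKIE_FILE are configured; the uid is
# a placeholder, and the returned Follow objects are only counted here.
def demo_get_follows():
    followees = get_follows('1789809794', 60, 1)  # up to 60 followees (3 pages)
    followers = get_follows('1789809794', 40, 2)  # up to 40 followers (2 pages)
    print 'followees:', len(followees), '- followers:', len(followers)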
def scrapy(self):
    login_status = login(self.login_username, self.login_password, self.cookies_file)
    if login_status:
        if self.start_uid:
            task_queue.put(self.start_uid)
        elif self.uids_file:
            uids_list = self.__load_uids__()
            for uid in uids_list:
                task_queue.put(uid)
        else:
            # either a start uid or a uids file is required
            raise Exception('ERROR: Start uid or uids file is needed.')
        # spawn a pool of threads, each consuming tasks from the shared queue
        for _ in range(self.thread_number):
            st = scrapy_threading(self.scrapy_do_task, self.wanted)
            st.setDaemon(True)
            st.start()
        task_queue.join()
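# scrapy_threading is defined elsewhere in this repo; the class below is only
# a plausible sketch of the queue-consumer shape it wraps, assuming task_queue
# is a Queue.Queue shared at module level (both are assumptions, not the
# repo's actual implementation).
import threading
import Queue

task_queue = Queue.Queue()

class scrapy_threading_sketch(threading.Thread):
    def __init__(self, do_task, wanted):
        threading.Thread.__init__(self)
        self.do_task = do_task
        self.wanted = wanted

    def run(self):
        while True:
            uid = task_queue.get()       # blocks until a uid is queued
            try:
                self.do_task(uid, self.wanted)
            finally:
                task_queue.task_done()   # lets task_queue.join() return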
def test_mblog():
    from weibo_login import login
    from conf import USERNAME, PASSWORD, COOKIE_FILE
    html = ''
    if login(USERNAME, PASSWORD, COOKIE_FILE):
        print 'Login!'
        from opener import urlfetch
        html = urlfetch("http://weibo.com/2584784292/weibo")
        #html = urlfetch("http://weibo.com/2803301701/mblog")
    mblogs = parse_mblog(html, '2584784292')
    for m in mblogs:
        print 'uid: ', m.uid
        print 'mid: ', m.mid
        print 'content: ', m.content
        print 'time: ', m.created_time
        print 'n_likes: ', m.n_likes
        print 'n_forward: ', m.n_forwards
        print 'n_comments: ', m.n_comments
        if m.geo:
            print 'longitude: ', m.geo.longitude
            print 'latitude: ', m.geo.latitude
            print 'location: ', m.geo.location
        print m.is_forward
        if m.is_forward:
            print 'ouid: ', m.ori_mblog.uid
            print 'omid: ', m.ori_mblog.mid
            print 'ocontent: ', m.ori_mblog.content
        print '======================================'
def get_uid_by_nickname(nickname):
    '''
    Resolve a weibo nickname to a uid via the http://weibo.com/n/<nickname>
    redirect page. Returns the uid string, or a message when no user matches.
    '''
    if not os.path.exists(COOKIE_FILE):
        time.sleep(random.uniform(5, 25))
    if login(USERNAME, PASSWORD, COOKIE_FILE):
        for o_o in range(MAXTIMES2TRY):
            try:
                url = 'http://weibo.com/n/' + nickname
                html = urlfetch(url)
            except URLError:
                log.error("URLError! - url: %s" % url)
                time.sleep(randint(1, MAXSLEEPINGTIME))
                continue
            else:
                if 'noresult_tit' in html:
                    return "This nickname does not match a weibo user"
                try:
                    uid_pattern = re.compile(r"oid'.+?'(\d+)'")
                    uid = uid_pattern.findall(html)[0]
                    return uid
                except IndexError:
                    # the oid marker was not found on this page; retry
                    continue
    else:
        log.error("Login fail!")
        try:
            os.remove(COOKIE_FILE)
        except OSError:
            pass
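# Usage sketch (placeholder nickname): on success the uid string is returned;
# when no user matches, the explanatory message is returned instead, so the
# result is checked with str.isdigit() before use.
def demo_get_uid():
    uid = get_uid_by_nickname('some_nickname')
    if uid is not None and uid.isdigit():
        print 'resolved uid:', uid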
def get_info(uid, weibo_user_type=1001):
    '''
    Get a weibo-user's information.
    `uid`: uid of this user
    `weibo_user_type`: 1001 stands for normal users, 1002 for media users
    return: UserInfo (or None if every attempt fails)
    '''
    if not os.path.exists(COOKIE_FILE):
        time.sleep(random.uniform(5, 25))
    if login(USERNAME, PASSWORD, COOKIE_FILE):
        for o_o in range(MAXTIMES2TRY):
            try:
                url = 'http://weibo.com/' + uid + '/info'
                html = urlfetch(url)
            except URLError:
                log.error("URLError! - url: %s" % url)
                time.sleep(randint(1, MAXSLEEPINGTIME))
                continue
            else:
                try:
                    info = parse_user_profile(html, weibo_user_type)
                except UnsuspectedPageStructError:
                    log.error("Unsuspected page structure! - url: %s" % url)
                else:
                    return info
    else:
        log.error("Login fail!")
        try:
            os.remove(COOKIE_FILE)
        except OSError:
            pass
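# Usage sketch for get_info; the uid is a placeholder. The function returns
# None when every retry fails, so the result is checked before use.
def demo_get_info():
    info = get_info('1789809794')  # 1001 (normal user) is the default type
    if info is not None:
        print info.nickname, info.n_followers, info.n_mblogs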
def weibo_spider(self, url):
    '''
    Crawler for trending weibo topics.
    '''
    username = '******'
    password = '******'
    session = login(username, password)
    url = "{url}&retcode=6102".format(url=url)
    r = session.get(url)
    if r.status_code == 200:
        # the count sits in backslash-escaped HTML inside the JS payload,
        # e.g. <strong class=\"W_f12\">3456<\/strong>
        p = r'<strong class=\\"W_f1[0-9]\\">(.*?)<\\/strong>'
        m = re.findall(p, r.text)
        if m:
            count = m[0]
            return self.deal_unit(count)
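# A self-contained check of the count-extraction regex above, run against a
# fabricated sample of the backslash-escaped payload (the sample string and
# the number in it are illustrative only):
def demo_count_pattern():
    sample = r'...<strong class=\"W_f12\">3456<\/strong>...'
    print re.findall(r'<strong class=\\"W_f1[0-9]\\">(.*?)<\\/strong>', sample)  # ['3456']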
def test_info():
    from weibo_login import login
    from conf import USERNAME, PASSWORD, COOKIE_FILE
    from opener import urlfetch
    if login(USERNAME, PASSWORD, COOKIE_FILE) is False:
        print "Login failed!"
        return
    html = urlfetch("http://weibo.com/1789809794/info")
    user_info = parse_user_profile(html)
    print user_info.nickname, user_info.location, user_info.sex, user_info.birth, \
        user_info.blog, user_info.site, user_info.intro, user_info.email, user_info.qq, user_info.msn
    print "n_followees: ", user_info.n_followees
    print "n_followers: ", user_info.n_followers
    print "n_mblogs: ", user_info.n_mblogs
    print "domain: ", user_info.domain
    for edu in user_info.edu:
        print edu.school, edu.time, edu.detail
    for work in user_info.work:
        print work.company, work.time, work.department_or_position, work.location
    for tag in user_info.tags:
        print tag
def test_info():
    from weibo_login import login
    from conf import USERNAME, PASSWORD, COOKIE_FILE
    from opener import urlfetch
    if login(USERNAME, PASSWORD, COOKIE_FILE) is False:
        print "Login failed!"
        return
    #html = urlfetch("http://weibo.com/1789809794/info")
    html = urlfetch("http://weibo.com/1618051664/info")
    user_info = parse_user_profile(html, 1002)
    print user_info.nickname, user_info.location, user_info.sex, user_info.birth, \
        user_info.blog, user_info.site, user_info.intro, user_info.email, user_info.qq, user_info.msn
    print "n_followees: ", user_info.n_followees
    print "n_followers: ", user_info.n_followers
    print "n_mblogs: ", user_info.n_mblogs
    print "domain: ", user_info.domain
    for edu in user_info.edu:
        print edu.school, edu.time, edu.detail
    for work in user_info.work:
        print work.company, work.time, work.department_or_position, work.location
    for tag in user_info.tags:
        print tag
def process(username, pwd):
    cookie_file = 'cookie/weibo_login_cookies.' + username + '.dat'
    if weibo_login.login(username, pwd, cookie_file):
        print 'Login WEIBO succeeded'
        # once logged in, urllib2 can be used directly; the following fetches
        # Kaifu Lee's weibo home page as an example
        time.sleep(10)
        home_page = urllib2.urlopen('http://www.weibo.com').read()
        #print home_page
        time.sleep(10)
        mid = weibo_zan.get_mid()
        print mid
        if mid != '':
            likeResponse = weibo_zan.like_mid(mid)
            print likeResponse
    else:
        print 'Login WEIBO failed'
    time.sleep(10)
def get_info(uid, proxy_ip=''):
    '''
    Get a weibo-user's information, optionally through an HTTP proxy.
    `uid`: uid of this user
    `proxy_ip`: 'host:port' of an HTTP proxy; an empty string means no proxy
    return: (UserInfo, user_type), or None if every attempt fails
    '''
    if login(USERNAME, PASSWORD, COOKIE_FILE):
        for o_o in range(MAXTIMES2TRY):
            try:
                url = 'http://weibo.com/' + uid + '/info'
                if proxy_ip == '':
                    html = urlfetch(url)
                else:
                    proxies = {"http": "http://" + proxy_ip}
                    cookies = get_cookie(COOKIE_FILE)
                    r = requests.get(url, cookies=cookies, headers=http_headers, proxies=proxies, timeout=7)
                    html = r.content
                    # debug dump of the raw page under a random name
                    with open('data/' + str(random.randint(1, 100)) + '.html', 'a') as f:
                        f.write(html)
            except (URLError, requests.RequestException):
                log.error("URLError! - url: %s" % url)
                time.sleep(randint(1, MAXSLEEPINGTIME))
                continue
            else:
                try:
                    info, user_type = parse_user_profile(html)
                except UnsuspectedPageStructError:
                    error_logger.error("Unsuspected page structure! - url: %s" % url)
                else:
                    return info, user_type
    else:
        log.error("Login fail!")
        try:
            os.remove(COOKIE_FILE)
        except OSError:
            pass
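# Usage sketch for the proxied get_info; the uid and the proxy address are
# placeholders. The result can be None, so it is unpacked only after a check.
def demo_get_info_proxy():
    result = get_info('1789809794', '127.0.0.1:8080')
    if result is not None:
        info, user_type = result
        print user_type, info.nickname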
def login():
    log.msg("login... ", level=log.INFO)
    username = '******'
    pwd = 'hdlhdl'
    cookie_file = 'weibo_login_cookies.dat'
    return weibo_login.login(username, pwd, cookie_file)
def get_mblogs_by_time(uid, start, end):
    mblog_count = 0
    last_update_time = datetime.datetime(1, 1, 1)
    if not os.path.exists(COOKIE_FILE):
        time.sleep(random.uniform(5, 25))
    if login(USERNAME, PASSWORD, COOKIE_FILE):
        info = get_info(uid)
        domain = info.domain
        url = 'http://weibo.com/%s/weibo?is_ori=1&is_forward=1&key_word=&start_time=%s&end_time=%s&is_search=1' % (uid, start, end)
        html = urlfetch(url)
        # the search page shows the total number of matching mblogs
        pattern = re.compile(r'S_spetxt.+?\>(\d+)\<')
        n_mblogs = int(pattern.findall(html)[0])
        print uid, start, n_mblogs
        n_pages = int(n_mblogs / 45)  # 45 mblogs per page
        page = 1  # begin from the first page
        while page <= n_pages + 1:
            count_on_this_page = 0
            base_url = 'http://weibo.com/%s/weibo?is_ori=1&is_forward=1&key_word=&start_time=%s&end_time=%s&page=%s&is_search=1' % (uid, start, end, page)
            mblogs = None
            for o_o in range(MAXTIMES2TRY):
                try:
                    html = urlfetch(base_url)
                except URLError:
                    log.error("URLError! - url: %s" % base_url)
                    time.sleep(randint(1, MAXSLEEPINGTIME))
                    continue
                else:
                    try:
                        mblogs = parse_mblog(html, uid)
                    except UnsuspectedPageStructError:
                        log.error("Unsuspected page structure! - url: %s" % base_url)
                    break
            if mblogs is not None and len(mblogs) > 0:
                if page == 1:
                    last_update_time = mblogs[0].created_time
                for mblog in mblogs:
                    if len(storage.MicroBlog.objects(mid=mblog.mid)) < 1:
                        mblog.save()
                        count_on_this_page += 1
                # the rest of each page is loaded via two ajax requests
                params = dict()
                params['max_id'] = mblogs[-1].mid
                params['end_id'] = mblogs[0].mid
                params['page'] = str(page)
                params['pre_page'] = str(page)
                params['count'] = str(15)
                params['feed_type'] = 0
                params['__rnd'] = str(int(time.time() * 1000))
                params['id'] = domain + uid if domain is not None and uid is not None else None
                params['pagebar'] = '0'
                params['domain'] = domain
                params['script_uri'] = r'/' + uid + r'/weibo'
                url = 'http://weibo.com/p/aj/mblog/mbloglist?is_ori=1&is_forward=1&key_word=&start_time=%s&end_time=%s&is_search=1&' % (start, end)
                url = url + urllib.urlencode(params)
                ajax_mblogs = None
                try:
                    ajax_resp = urlfetch(url)
                except URLError:
                    log.error("URLError! - url: %s" % url)
                else:
                    try:
                        ajax_mblogs = parse_json(ajax_resp, uid)
                    except JsonDataParsingError:
                        log.error("No json data to be loaded! - url: %s" % url)
                    except UnsuspectedPageStructError:
                        log.error("Unsuspected page structure! - url: %s" % url)
                if ajax_mblogs is not None and len(ajax_mblogs) > 0:
                    params['max_id'] = ajax_mblogs[-1].mid
                    params['__rnd'] = str(int(time.time() * 1000))
                    params['pagebar'] = '1'
                    url = 'http://weibo.com/p/aj/mblog/mbloglist?is_ori=1&is_forward=1&key_word=&start_time=%s&end_time=%s&is_search=1&' % (start, end)
                    url = url + urllib.urlencode(params)
                    ajax_resp = urlfetch(url)
                    try:
                        ajax_mblogs += parse_json(ajax_resp, uid)
                    except JsonDataParsingError:
                        log.error("No json data to be loaded! - url: %s" % url)
                    except UnsuspectedPageStructError:
                        log.error("Unsuspected page structure! - url: %s" % url)
                    for mblog in ajax_mblogs:
                        if len(storage.MicroBlog.objects(mid=mblog.mid)) < 1:
                            mblog.save()
                            count_on_this_page += 1
                log.info("MicroBlogs fetched - uid: %s - page: %d - count: %d" % (uid, page, count_on_this_page))
            else:
                # page fetch failed: drop the cookie and re-login
                try:
                    os.remove(COOKIE_FILE)
                except OSError:
                    pass
                if not os.path.exists(COOKIE_FILE):
                    time.sleep(random.uniform(5, 25))
                    if not login(USERNAME, PASSWORD, COOKIE_FILE):
                        log.error("Login fail!")
            page += 1
            mblog_count += count_on_this_page
            time.sleep(randint(1, MAXSLEEPINGTIME))
    else:
        log.error("Login fail!")
        try:
            os.remove(COOKIE_FILE)
        except OSError:
            pass
    return mblog_count, last_update_time
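# Usage sketch for get_mblogs_by_time. The uid is a placeholder and the date
# strings assume the YYYY-MM-DD form of weibo's start_time/end_time search
# parameters (an assumption; the function passes them through verbatim).
def demo_get_mblogs_by_time():
    count, last_seen = get_mblogs_by_time('1789809794', '2014-01-01', '2014-06-30')
    print 'saved %d mblogs, newest created at %s' % (count, last_seen)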
def add_crawl(uid, weibo_user_type=1001):
    '''
    Incrementally crawl a user already stored in mongodb: fetch pages until a
    microblog no newer than the stored last_update_time is seen.
    '''
    event_logger.info("Add crawl start uid: %s" % uid)
    last_update_time = storage.WeiboUser.objects(uid=uid)[0].last_update_time
    if last_update_time is None:
        last_update_time = datetime.datetime(1, 1, 1)
    new_last_update_time = datetime.datetime(1, 1, 1)
    info = get_info(uid)
    mblog_count = 0
    new_mblog_count = 0
    n_mblogs = info.n_mblogs
    domain = info.domain
    if not os.path.exists(COOKIE_FILE):
        time.sleep(random.uniform(5, 25))
    if login(USERNAME, PASSWORD, COOKIE_FILE):
        if n_mblogs is None:
            n_mblogs = 0
        n_pages = int(n_mblogs / 45)  # 45 mblogs per page
        page = 1  # begin from the first page
        while page <= n_pages + 1:
            count_on_this_page = 0
            url = ''
            if weibo_user_type == 1001:
                url = 'http://weibo.com/' + uid + '/weibo?page=' + str(page)
            elif weibo_user_type == 1002:
                url = 'http://weibo.com/' + uid + '/mblog?page=' + str(page)
            mblogs = None
            for o_o in range(MAXTIMES2TRY):
                try:
                    html = urlfetch(url)
                except URLError:
                    log.error("URLError! - url: %s" % url)
                    time.sleep(randint(1, MAXSLEEPINGTIME))
                    continue
                else:
                    try:
                        mblogs = parse_mblog(html, uid)
                        if page == 1:
                            mblogs = sorted(mblogs, key=create_time, reverse=True)
                    except UnsuspectedPageStructError:
                        log.error("Unsuspected page structure! - url: %s" % url)
                    break
            if mblogs is not None and len(mblogs) > 0:
                if page == 1:
                    new_last_update_time = mblogs[0].created_time
                for mblog in mblogs:
                    if mblog.created_time <= last_update_time:
                        # reached already-stored territory: record progress and stop
                        if new_last_update_time > last_update_time:
                            storage.WeiboUser.objects(uid=uid).update(set__last_update_time=new_last_update_time, set__info=info)
                        log.info("MicroBlogs fetched - uid: %s - new microblog count: %d" % (uid, new_mblog_count))
                        return
                    if len(storage.MicroBlog.objects(mid=mblog.mid)) < 1:
                        mblog.save()
                        new_mblog_count += 1
                        count_on_this_page += 1
                # load ajax data
                params = dict()
                params['max_id'] = mblogs[-1].mid
                params['end_id'] = mblogs[0].mid
                params['page'] = str(page)
                params['pre_page'] = str(page)
                params['count'] = str(15)
                params['feed_type'] = 0
                params['__rnd'] = str(int(time.time() * 1000))
                params['id'] = domain + uid if domain is not None and uid is not None else None
                params['pagebar'] = '0'
                params['domain'] = domain
                params['script_uri'] = r'/' + uid + r'/weibo'
                url = 'http://weibo.com/p/aj/mblog/mbloglist?' + urllib.urlencode(params)
                ajax_mblogs = None
                try:
                    ajax_resp = urlfetch(url)
                except URLError:
                    log.error("URLError! - url: %s" % url)
                else:
                    try:
                        ajax_mblogs = parse_json(ajax_resp, uid)
                    except JsonDataParsingError:
                        log.error("No json data to be loaded! - url: %s" % url)
                    except UnsuspectedPageStructError:
                        log.error("Unsuspected page structure! - url: %s" % url)
                if ajax_mblogs is not None and len(ajax_mblogs) > 0:
                    params['max_id'] = ajax_mblogs[-1].mid
                    params['__rnd'] = str(int(time.time() * 1000))
                    params['pagebar'] = '1'
                    url = 'http://weibo.com/p/aj/mblog/mbloglist?' + urllib.urlencode(params)
                    ajax_resp = urlfetch(url)
                    try:
                        ajax_mblogs += parse_json(ajax_resp, uid)
                    except JsonDataParsingError:
                        log.error("No json data to be loaded! - url: %s" % url)
                    except UnsuspectedPageStructError:
                        log.error("Unsuspected page structure! - url: %s" % url)
                    for mblog in ajax_mblogs:
                        if mblog.created_time <= last_update_time:
                            if new_last_update_time > last_update_time:
                                storage.WeiboUser.objects(uid=uid).update(set__last_update_time=new_last_update_time, set__info=info)
                            log.info("MicroBlogs fetched - uid: %s - new microblog count: %d" % (uid, new_mblog_count))
                            return
                        if len(storage.MicroBlog.objects(mid=mblog.mid)) < 1:
                            mblog.save()
                            new_mblog_count += 1
                            count_on_this_page += 1
                log.info("MicroBlogs fetched - uid: %s - page: %d - count: %d" % (uid, page, count_on_this_page))
            else:
                # page fetch failed: drop the cookie and re-login
                try:
                    os.remove(COOKIE_FILE)
                except OSError:
                    pass
                if not os.path.exists(COOKIE_FILE):
                    time.sleep(random.uniform(5, 25))
                    if not login(USERNAME, PASSWORD, COOKIE_FILE):
                        log.error("Login fail!")
            page += 1
            mblog_count += count_on_this_page
            time.sleep(randint(1, MAXSLEEPINGTIME))
    else:
        log.error("Login fail!")
        try:
            os.remove(COOKIE_FILE)
        except OSError:
            pass
    if new_last_update_time > last_update_time:
        storage.WeiboUser.objects(uid=uid).update(set__last_update_time=new_last_update_time)
def get_mblogs(uid, n_mblogs, domain=None, weibo_user_type=1001, limit=100):
    '''
    Get micro-blogs of a weibo-user and save them to mongodb.
    `uid`: uid of this user
    `n_mblogs`: number of mblogs to get
    `domain`: domain of this user
    `weibo_user_type`: 1001 stands for normal users, 1002 for media users
    return: (number of mblogs actually saved, creation time of the newest mblog)
    '''
    mblog_count = 0
    last_update_time = datetime.datetime(1, 1, 1)
    if not os.path.exists(COOKIE_FILE):
        time.sleep(random.uniform(5, 25))
    if login(USERNAME, PASSWORD, COOKIE_FILE):
        if n_mblogs is None:
            n_mblogs = 0
        n_pages = int(n_mblogs / 45)  # 45 mblogs per page
        #n_pages = n_pages if n_pages < limit else limit
        page = 1  # begin from the first page
        while page <= n_pages + 1:
            count_on_this_page = 0
            url = ''
            if weibo_user_type == 1001:
                url = 'http://weibo.com/' + uid + '/weibo?page=' + str(page)
            elif weibo_user_type == 1002:
                url = 'http://weibo.com/' + uid + '/mblog?page=' + str(page)
            mblogs = None
            for o_o in range(MAXTIMES2TRY):
                try:
                    html = urlfetch(url)
                except URLError:
                    log.error("URLError! - url: %s" % url)
                    time.sleep(randint(1, MAXSLEEPINGTIME))
                    continue
                else:
                    try:
                        mblogs = parse_mblog(html, uid)
                    except UnsuspectedPageStructError:
                        log.error("Unsuspected page structure! - url: %s" % url)
                    else:
                        break
            if mblogs is not None and len(mblogs) > 0:
                if page == 1:
                    last_update_time = mblogs[0].created_time
                for mblog in mblogs:
                    if len(storage.MicroBlog.objects(mid=mblog.mid)) < 1:
                        mblog.save()
                        count_on_this_page += 1
                # load ajax data
                params = dict()
                params['max_id'] = mblogs[-1].mid
                params['end_id'] = mblogs[0].mid
                params['page'] = str(page)
                params['pre_page'] = str(page)
                params['count'] = str(15)
                params['feed_type'] = 0
                params['__rnd'] = str(int(time.time() * 1000))
                params['id'] = domain + uid if domain is not None and uid is not None else None
                params['pagebar'] = '0'
                params['domain'] = domain
                params['script_uri'] = r'/' + uid + r'/weibo'
                url = 'http://weibo.com/p/aj/mblog/mbloglist?' + urllib.urlencode(params)
                ajax_mblogs = None
                for o_O in range(MAXTIMES2TRY):
                    try:
                        ajax_resp = urlfetch(url)
                    except URLError:
                        log.error("URLError! - url: %s" % url)
                    else:
                        try:
                            ajax_mblogs = parse_json(ajax_resp, uid)
                        except JsonDataParsingError:
                            log.error("No json data to be loaded! - url: %s" % url)
                        except UnsuspectedPageStructError:
                            log.error("Unsuspected page structure! - url: %s" % url)
                        else:
                            break
                if ajax_mblogs is not None and len(ajax_mblogs) > 0:
                    params['max_id'] = ajax_mblogs[-1].mid
                    params['__rnd'] = str(int(time.time() * 1000))
                    params['pagebar'] = '1'
                    url = 'http://weibo.com/p/aj/mblog/mbloglist?' + urllib.urlencode(params)
                    for o_O in range(MAXTIMES2TRY):
                        try:
                            ajax_resp = urlfetch(url)
                        except URLError:
                            log.error("URLError! - url: %s" % url)
                        else:
                            try:
                                ajax_mblogs += parse_json(ajax_resp, uid)
                            except JsonDataParsingError:
                                log.error("No json data to be loaded! - url: %s" % url)
                            except UnsuspectedPageStructError:
                                log.error("Unsuspected page structure! - url: %s" % url)
                            else:
                                break
                    for mblog in ajax_mblogs:
                        if len(storage.MicroBlog.objects(mid=mblog.mid)) < 1:
                            mblog.save()
                            count_on_this_page += 1
                log.info("MicroBlogs fetched - uid: %s - page: %d - count: %d" % (uid, page, count_on_this_page))
            else:
                # page fetch failed: drop the cookie and re-login
                try:
                    os.remove(COOKIE_FILE)
                except OSError:
                    pass
                if not os.path.exists(COOKIE_FILE):
                    time.sleep(random.uniform(5, 25))
                    if not login(USERNAME, PASSWORD, COOKIE_FILE):
                        log.error("Login fail!")
            page += 1
            mblog_count += count_on_this_page
            time.sleep(randint(1, MAXSLEEPINGTIME))
    else:
        log.error("Login fail!")
        try:
            os.remove(COOKIE_FILE)
        except OSError:
            pass
    print last_update_time
    return mblog_count, last_update_time
def get_follows(uid, proxy_ip=''):
    event_logger.info("get follows start uid: %s" % uid)
    info, user_type = get_info(uid, proxy_ip)
    if info:
        event_logger.info("Information fetching succeeded - uid: %s" % uid)
    else:
        event_logger.critical("Information fetching failed - uid: %s" % uid)
        return
    n_follows = info.n_followees
    if login(USERNAME, PASSWORD, COOKIE_FILE):
        follows = []
        # weibo only exposes the first 200 follows; check None before comparing
        n_follows = 200 if n_follows is None or n_follows > 200 else n_follows
        # 20 follows per page
        n_pages = n_follows // 20 if n_follows % 20 == 0 else n_follows // 20 + 1
        for page in range(n_pages):
            for o_o in range(MAXTIMES2TRY):
                try:
                    url = 'http://weibo.com/' + uid + '/follow?page=' + str(page + 1)
                    if proxy_ip == '':
                        html = urlfetch(url)
                    else:
                        proxies = {"http": "http://" + proxy_ip}
                        cookies = get_cookie(COOKIE_FILE)
                        r = requests.get(url, cookies=cookies, headers=http_headers, proxies=proxies, timeout=7)
                        html = r.content
                except (URLError, requests.RequestException):
                    log.error("URLError! - url: %s" % url)
                    time.sleep(randint(1, MAXSLEEPINGTIME))
                    continue
                else:
                    try:
                        follows_on_current_page = parse_follow(html)
                    except UnsuspectedPageStructError:
                        error_logger.error("Unsuspected page structure! - url: %s" % url)
                        try:
                            os.remove(COOKIE_FILE)
                        except OSError:
                            pass
                        if not login(USERNAME, PASSWORD, COOKIE_FILE):
                            log.error("Login fail!")
                    else:
                        follows += follows_on_current_page
                        log.info("Followees fetched. - uid: %s - count: %d - page: %d" % (uid, len(follows_on_current_page), page + 1))
                        break
            time.sleep(randint(1, MAXSLEEPINGTIME))
            if page == 2 and len(follows) == 0:
                # three empty pages in a row almost certainly means a block
                raise CountException
        user, create = storage.WeiboUser.objects.get_or_create(uid=uid)
        user.info = info
        user.followees = follows
        user.save()
    else:
        log.error("Login fail!")
        try:
            os.remove(COOKIE_FILE)
        except OSError:
            pass
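# Usage sketch for the proxied get_follows; the uid and the proxy address are
# placeholders, and an empty proxy_ip keeps the plain urlfetch path. Results
# land in storage.WeiboUser rather than being returned.
def demo_get_follows_proxy():
    get_follows('1789809794')                      # direct connection
    get_follows('1789809794', '127.0.0.1:8080')    # via an HTTP proxy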