def valid_cookie(self, html=''):
    """Check whether the current weibo.cn cookie is still usable.

    When *html* is empty a probe page is fetched first.  If the page
    shows the login marker a relogin is attempted.  On success the
    'gsid' session cookie is cached in self.login_params and True is
    returned; on any failure the cookie file is cleared and False is
    returned.
    """
    html = str(html)
    if not html:
        # Probe an arbitrary public profile page to see what we get back.
        url = 'http://weibo.cn/kaifulee'
        headers = self.get_headers(url)
        html = self.get_content_head(url, headers=headers)
        if not html:
            msg = 'Error in cookie: need relogin.'
            logger.info(msg)
            write_message(msg, self.window)
            self.clear_cookie(self.cookie_file)
            return False
    # The page still shows the login link -> the cookie has expired.
    if u'登录' in html:
        if not self.login():
            msg = 'In valid_cookie: relogin failed.'
            logger.info(msg)
            write_message(msg, self.window)
            self.clear_cookie(self.cookie_file)
            return False
    # Extract the mobile-site session id; the last matching cookie wins.
    gsid = None
    for c in self.cj:
        if c.name.startswith('gsid') and c.domain == '.weibo.cn':
            gsid = c.value
    # NOTE(review): gsid may still be None here if no cookie matched --
    # confirm whether callers tolerate that.
    self.login_params = {'gsid': gsid, 'vt': '4', 'lret': '1'}
    return True
def _fetch_msg_repost(self, msg_id, page=1):
    """Fetch one reposts page for *msg_id*, retrying wrong-looking pages.

    Returns (html, num_pages) on success, (None, None) when the page
    layout changed or every retry came back wrong.
    """
    html, num_pages = self.fetcher.fetch_msg_reposts(msg_id, page)
    page_right = self._check_page_right(html)
    if page_right is None:
        # Layout changed.  BUG FIX: return a pair so callers that unpack
        # ``html, num_pages = ...`` do not crash (original returned a
        # bare None here).
        return None, None
    if page_right:
        return html, num_pages
    tries = 0
    while not page_right and tries <= 10:
        time.sleep(10)
        self.fetcher.check_cookie()   # the cookie may have expired
        sec = (tries + 1) * 10        # back off a little longer each try
        write_message('_fetch trying: %s, sleep: %s seconds' % (tries, sec),
                      self.window)
        time.sleep(sec)
        html, num_pages = self.fetcher.fetch_msg_reposts(msg_id, page)
        page_right = self._check_page_right(html)
        if page_right:
            return html, num_pages
        tries += 1
    return None, None
def _fetch(self, url):
    """Download *url*, re-validating the cookie and retrying while the
    page looks like a pre-login page; None once all retries fail."""
    html = self.fetcher.fetch(url)
    if self._check_page_right(html):
        return html
    for attempt in range(11):
        time.sleep(10)
        self.fetcher.check_cookie()
        pause = (attempt + 1) * 10
        write_message(
            '_fetch trying: %s, sleep: %s seconds' % (attempt, pause),
            self.window)
        time.sleep(pause)
        html = self.fetcher.fetch(url)
        if self._check_page_right(html):
            return html
    return None
def get_servertime(self): url = ('http://login.sina.com.cn/sso/prelogin.php?entry=account' '&callback=sinaSSOController.preloginCallBack&su=&rsakt=mod' '&client=ssologin.js(v1.4.5)&_=%s' % self.get_milli_time()) headers = self.get_headers(url) headers['Accept'] = '*/*' headers['Referer'] = 'http://weibo.com/' del headers['Accept-encoding'] result = {} req = self.pack_request(url, headers) for _ in range(3): data = None try: with contextlib.closing(urllib2.urlopen(req)) as resp: data = resp.read() p = re.compile('\((.*)\)') json_data = p.search(data).group(1) data = json.loads(json_data) result['servertime'] = str(data['servertime']) result['nonce'] = data['nonce'] result['rsakv'] = str(data['rsakv']) result['pubkey'] = str(data['pubkey']) self.pcid = str(data['pcid']) break except Exception, e: msg = 'Get severtime error. %s' %str(e) logger.info(msg) write_message(msg, self.window)
def get_login_form(self): url = 'http://3g.sina.com.cn/prog/wapsite/sso/login.php?ns=1&revalid=2&backURL=http%3A%2F%2Fweibo.cn%2F&backTitle=%D0%C2%C0%CB%CE%A2%B2%A9&vt=' headers = self.get_headers(url) headers['Accept'] = '*/*' headers['Referer']= 'http://weibo.cn' del headers['Accept-encoding'] req = self.pack_request(url, headers) rand = None passwd_s = None vk = None for _ in range(3): try: data = None with contextlib.closing(urllib2.urlopen(req)) as resp: data = resp.read() rand = HTML.fromstring(data).xpath('//form/@action')[0] passwd_s = HTML.fromstring(data).xpath("//input[@type='password']/@name")[0] vk = HTML.fromstring(data).xpath("//input[@name='vk']/@value")[0] return rand, passwd_s, vk except Exception, e: msg = 'get login form error: %s' %str(e) logger.info(msg) write_message(msg, self.window) pass
def check_cookie(self, user=None, pwd=None, soft_path=None):
    """Ensure a usable cookie file exists, logging in when necessary.

    Falls back to self.username/self.password when credentials are not
    supplied.  Returns valid_cookie()'s result after a successful login,
    False when the login itself failed.
    """
    if user is None or pwd is None:
        user = self.username
        pwd = self.password
    assert (user is not None and pwd is not None)
    if soft_path is None:
        soft_path = self.soft_path
    login_ok = True
    self.cookie_file = os.path.join(soft_path, settings.COMWEIBO_COOKIE)
    if os.path.exists(self.cookie_file):
        msg = 'cookie exist.'
        write_message(msg)
        # BUG FIX: close the file handle (the original leaked it via a
        # bare open(...).read()).
        with open(self.cookie_file, 'r') as f:
            cookie_data = f.read()
        if 'Set-Cookie' not in cookie_data:
            msg = 'but does not contain a valid cookie.'
            write_message(msg)
            login_ok = self.login(user, pwd)
    else:
        login_ok = self.login(user, pwd)
    if login_ok:
        return self.valid_cookie()
    else:
        return False
def save_verify_code(self, url): try: cookie_str = '' for cookie in self.cj.as_lwp_str(True, True).split('\n'): cookie = cookie.split(';')[0] cookie = cookie.replace('\"', '').replace('Set-Cookie3: ', ' ').strip()+';' cookie_str += cookie headers = self.get_headers(url) headers['Accept'] = 'image/png,image/*;q=0.8,*/*;q=0.5' headers['Referer'] = 'http://weibo.com/' headers['Cookie'] = cookie_str del headers['Accept-encoding'] req = self.pack_request(url, headers) content = self.urlopen_read(req) f = open(os.path.join(self.soft_path, 'pin.png'), 'wb') f.write(content) f.flush() f.close() except Exception, e: msg = 'Save verify code error. %s' %str(e) logger.info(msg) write_message(msg, self.window) return
def _fetch_msg_comment(self, msg_id, page=1):
    """Fetch one comments page for *msg_id*, retrying wrong pages.

    Returns (html, num_pages) on success, (None, None) when the layout
    changed or every retry failed.
    """
    html, num_pages = self.fetcher.fetch_msg_comments(msg_id, page)
    page_right = self._check_page_right(html)
    if page_right is None:
        # BUG FIX: return a pair so callers that unpack the result do
        # not crash (original returned a bare None).
        return None, None
    if page_right:
        return html, num_pages
    tries = 0
    while not page_right and tries <= 10:
        time.sleep(10)
        self.fetcher.check_cookie()
        sec = (tries + 1) * 10
        write_message(
            '_fetch trying: %s, sleep: %s seconds' % (tries, sec),
            self.window)
        time.sleep(sec)
        # BUG FIX: the retry fetched *reposts* instead of comments
        # (copy-paste from _fetch_msg_repost).
        html, num_pages = self.fetcher.fetch_msg_comments(msg_id, page)
        page_right = self._check_page_right(html)
        if page_right:
            return html, num_pages
        tries += 1
    return None, None
def fetch_msg_comments(self, msg_id, page=1):
    """Fetch one comments page for *msg_id* via the ajax endpoint.

    Returns (html, total_pages) on success, (None, None) on an API
    error, or (raw_page, None) when the response is not JSON.
    """
    url = 'http://weibo.com/aj/comment/big?_wv=5'
    headers = self.get_headers(url)
    headers['Accept'] = '*/*'
    headers['Referer'] = 'http://weibo.com/'
    del headers['Accept-encoding']
    body = {
        '__rnd': str(int(time.time() * 1000)),  # cache-buster timestamp
        '_t': '0',
        'id': msg_id,
        'page': page,
    }
    # BUG FIX: a '&' separator is required after '?_wv=5'; the original
    # concatenated directly, fusing the first encoded parameter into
    # the _wv value.
    url = url + '&' + urllib.urlencode(body)
    req = self.pack_request(url, headers)
    raw = self.urlopen_read(req)
    try:
        # Parse the response once (original re-parsed up to three times).
        resp = json.loads(raw)
        if resp['code'] == '100000':
            data = resp['data']
            return data['html'], int(data['page']['totalpage'])
        else:
            msg = resp['msg']
            logger.info(msg)
            write_message(msg, self.window)
            return None, None
    except ValueError:
        # Not JSON at all -- hand the raw page back to the caller.
        return raw, None
def _fetch(self, url):
    """Fetch *url*; keep retrying with a cookie re-check and a growing
    pause while the page looks wrong.  None when attempts run out."""
    attempt = 0
    html = self.fetcher.fetch(url)
    ok = self._check_page_right(html)
    while not ok and attempt <= 10:
        time.sleep(10)
        self.fetcher.check_cookie()
        delay = (attempt + 1) * 10
        write_message('_fetch trying: %s, sleep: %s seconds' % (attempt, delay), self.window)
        time.sleep(delay)
        html = self.fetcher.fetch(url)
        ok = self._check_page_right(html)
        attempt += 1
    if ok:
        return html
    return None
def _crawl(parser, uid, page, num_pages=''):
    # Crawl a single fans page and hand it to the parser; any fetch or
    # parse failure yields 0 so the caller sees "no pages".
    write_message('Crawl user(%s)\'s fans-page: %s:%s' % (self.uid, num_pages, page),
                  self.window)
    url = 'http://weibo.com/%s/fans?page=%s' % (uid, page)
    html = self._fetch(url, query=settings.QUERY_FANS)
    try:
        document = pq(html)
        return parser.parse(document)
    except:
        return 0
def _crawl(parser, uid, page, num_pages=""): msg = "Crawl user(%s)'s weibos-page: %s:%s" % (self.uid, num_pages, page) write_message(msg, self.window) html = self._fetch_weibo(uid, page) try: pq_doc = pq(html) return parser.parse(pq_doc) except: return 0
def _crawl(parser, uid, page, num_pages=''):
    # Re-scan one weibos page looking for new posts; 0 on any failure.
    info = 'check new weibo in user(%s)\'s weibos-page: %s:%s' % (self.uid, num_pages, page)
    write_message(info, self.window)
    body = self._fetch_weibo(uid, page)
    try:
        return parser.parse(pq(body))
    except:
        return 0
def _crawl(parser, uid, page, num_pages=""): msg = "Crawl user(%s)'s follows-page: %s:%s" % (self.uid, num_pages, page) write_message(msg, self.window) url = "http://weibo.com/%s/follow?page=%s" % (uid, page) html = self._fetch(url, query=settings.QUERY_FOLLOWS) try: pq_doc = pq(html) return parser.parse(pq_doc) except: return 0
def _crawl(parser, msg_id, page, num_pages=''):
    # One comments page; parsing is best-effort, the page total from
    # the fetch is always returned.
    write_message('Crawl message(%s)\'s comments-page:%s:%s' % (msg_id, num_pages, page),
                  self.window)
    html, num_pages = self._fetch_msg_comment(msg_id, page)
    try:
        parser.parse(pq(html))
    except:
        pass
    return num_pages
def _crawl(parser, msg_id, page, num_pages=""): msg = "Crawl message(%s)'s reposts-page:%s:%s" % (self.msg_id, num_pages, page) write_message(msg, self.window) html, num_pages = self._fetch_msg_repost(msg_id, page) try: pq_doc = pq(html) parser.parse(pq_doc) except: pass return num_pages
def do_login(self, login_user, login_pwd, door=''):
    """POST the SSO login form for weibo.com.

    *door* is the captcha text, required only when sina demands one.
    Returns the raw response text on success, None when the request
    raised (the error is logged).
    """
    login_ok = False
    try:
        username = login_user
        pwd = login_pwd
        url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.5)'
        # Static portion of the SSO form; dynamic fields are added below.
        postdata = {
            'entry': 'weibo',
            'gateway': '1',
            'from': '',
            'savestate': '7',
            'userticket': '1',
            'pagerefer': '',
            'ssosimplelogin': '******',
            'vsnf': '1',
            'vsnval': '',
            'service': 'miniblog',
            'pwencode': 'rsa2',
            'rsakv': self.rsakv,
            'encoding': 'utf-8',
            'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
            'returntype': 'META',
            'prelt': '26',
        }
        # Dynamic fields: prelogin data plus the encoded credentials.
        postdata['servertime'] = self.servertime
        postdata['nonce'] = self.nonce
        postdata['su'] = self.get_user(username)
        postdata['sp'] = self.get_pwd(pwd, self.servertime, self.nonce).lower()
        # When captcha login is required.
        if door:
            postdata['pcid'] = self.pcid
            postdata['door'] = door.lower()
        headers = self.get_headers(url)
        headers['Referer'] = 'http://weibo.com/'
        req = self.pack_request(url, headers, postdata)
        text = self.urlopen_read(req)
        return text
    except Exception, e:
        msg = 'Error in do_login. %s' % str(e)
        logger.info(msg)
        write_message(msg, self.window)
def _crawl(parser, uid, page, num_pages='?'):
    # One weibos page; None (rather than 0) signals failure here.
    write_message('Crawl user(%s)\'s weibos-page: %s:%s' % (self.uid, num_pages, page),
                  self.window)
    payload = self._fetch_weibo(uid, page)
    if payload is None:
        return None
    try:
        return parser.parse(pq(payload))
    except:
        return None
def crawl_follows(self):
    # Crawl every follows page of self.uid, honouring settings.PAGE_LIMIT:
    # page 1 synchronously to learn the page count, the rest on a
    # 5-thread worker pool.
    def _crawl(parser, uid, page, num_pages=''):
        # One follows page; 0 when fetch or parse fails.
        msg = 'Crawl user(%s)\'s follows-page: %s:%s' %(self.uid, num_pages, page)
        write_message(msg, self.window)
        url = 'http://weibo.com/%s/follow?page=%s' %(uid, page)
        html = self._fetch(url, query=settings.QUERY_FOLLOWS)
        try:
            pq_doc = pq(html)
            return parser.parse(pq_doc)
        except:
            return 0

    msg = 'Checking: whether user(%s) exists or not...' %self.uid
    write_message(msg, self.window)
    is_exist= self.fetcher.check_user(self.uid)
    if is_exist is None:
        # Network/login failure: abort quietly.
        return
    if not is_exist:
        msg = 'Not exist: %s.' %(self.uid)
        logger.info(msg)
        write_message(msg, self.window)
        return
    self.storage = FileStorage(self.uid, settings.MASK_FOLLOW, self.store_path)
    start_time = time.time()
    parser = ComFollowsParser(self.storage, uids_storage=self.uids_storage)
    # First page also tells us how many pages exist in total.
    num_pages = _crawl(parser, self.uid, page=1)
    if settings.PAGE_LIMIT != 0:
        # Sina caps how many follow pages a normal account may view.
        if num_pages > settings.PAGE_LIMIT:
            msg = 'For sina policy, reduce page count from %s to %s' %(num_pages, settings.PAGE_LIMIT)
            write_message(msg, self.window)
            num_pages = settings.PAGE_LIMIT
    pages = [i for i in xrange(2, num_pages+1)]
    if len(pages) > 0:
        n_threads = 5
        worker_manager = WorkerManager(n_threads)
        for pg in pages:
            worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)
        worker_manager.wait_all_complete()
    cost_time = int(time.time() - start_time)
    msg = ('Crawl user(%s)\'s follows: total page=%s,'
           ' cost time=%s sec, connections=%s'
           %(self.uid, num_pages, cost_time, self.fetcher.n_connections))
    logger.info(msg)
    write_message(msg, self.window)
def _crawl(parser, uid, page, num_pages='?'):
    # One weibo.cn fans page; None signals fetch or parse failure.
    write_message('Crawl user(%s)\'s fans-page: %s:%s' % (self.uid, num_pages, page),
                  self.window)
    html = self._fetch('http://weibo.cn/%s/fans?page=%s' % (uid, page))
    if html is None:
        return None
    try:
        return parser.parse(pq(html))
    except:
        return None
def _crawl(parser, uid, page, num_pages='?'):
    # Fetch and parse one weibos page; any problem yields None.
    progress = 'Crawl user(%s)\'s weibos-page: %s:%s' % (self.uid, num_pages, page)
    write_message(progress, self.window)
    raw = self._fetch_weibo(uid, page)
    if raw is None:
        return None
    try:
        doc = pq(raw)
        return parser.parse(doc)
    except:
        return None
def do_login(self, login_user, login_pwd, door=''):
    """POST the sina SSO login form with rsa2-encoded credentials.

    *door* carries the captcha answer when one is required.  Returns
    the raw response body on success, None when the request raised.
    """
    login_ok = False
    try:
        username = login_user
        pwd = login_pwd
        url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.5)'
        # Fixed form fields expected by the ssologin.js v1.4.5 protocol.
        postdata = {
            'entry'         : 'weibo',
            'gateway'       : '1',
            'from'          : '',
            'savestate'     : '7',
            'userticket'    : '1',
            'pagerefer'     : '',
            'ssosimplelogin': '******',
            'vsnf'          : '1',
            'vsnval'        : '',
            'service'       : 'miniblog',
            'pwencode'      : 'rsa2',
            'rsakv'         : self.rsakv,
            'encoding'      : 'utf-8',
            'url'           : 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
            'returntype'    : 'META',
            'prelt'         : '26',
        }
        # Per-session fields from the prelogin handshake.
        postdata['servertime'] = self.servertime
        postdata['nonce'] = self.nonce
        postdata['su'] = self.get_user(username)
        postdata['sp'] = self.get_pwd(pwd, self.servertime, self.nonce).lower()
        # When captcha login is required.
        if door:
            postdata['pcid'] = self.pcid
            postdata['door'] = door.lower()
        headers = self.get_headers(url)
        headers['Referer'] = 'http://weibo.com/'
        req = self.pack_request(url, headers, postdata)
        text = self.urlopen_read(req)
        return text
    except Exception, e:
        msg = 'Error in do_login. %s' %str(e)
        logger.info(msg)
        write_message(msg, self.window)
def _check_page_right(self, html): ''' check whether the page is got before login or after. ''' if html is None: return False if len(html) == 0: msg = u'weibo改版了,信息标签发生变化' logger.info(msg) write_message(msg, self.window) return None return not (u'<title>' in html)
def _crawl(parser, uid, page, num_pages='?'):
    # weibo.cn fans page crawl; returns the parser result or None.
    write_message('Crawl user(%s)\'s fans-page: %s:%s' % (self.uid, num_pages, page), self.window)
    address = 'http://weibo.cn/%s/fans?page=%s' % (uid, page)
    content = self._fetch(address)
    if content is None:
        return None
    try:
        return parser.parse(pq(content))
    except:
        return None
def _crawl(parser, msg_id, page, num_pages='?'):
    # One reposts page; returns the reported page total, None on failure.
    write_message('Crawl message(%s)\'s reposts-page:%s:%s' % (self.msg_id, num_pages, page),
                  self.window)
    html, num_pages = self._fetch_msg_repost(msg_id, page)
    if html is None:
        return None
    try:
        parser.parse(pq(html))
        return num_pages
    except:
        return None
def login(self, login_user=None, login_pwd=None):
    """Log in to weibo.cn through the 3g.sina.com.cn WAP SSO form.

    Uses the stored credentials unless *login_user*/*login_pwd* are
    given and none are stored yet.  On success the login params are
    cached, the cookie jar is saved, and True is returned; False on a
    bad username/password.
    """
    if self.username is None or self.password is None:
        self.username = login_user
        self.password = login_pwd
    assert self.username is not None and self.password is not None
    # The login form uses a per-request action token (rand) and a
    # dynamically named password field (passwd_s).
    rand, passwd_s, vk = self.get_login_form()
    postdata = {
        'mobile'   : self.username,
        passwd_s   : self.password,
        'remember' : 'on',
        'backURL'  : 'http://weibo.cn/',
        'backTitle': '新浪微博',
        'vk'       : vk,
        'submit'   : '登录',
        'encoding' : 'utf-8'
    }
    url = 'http://3g.sina.com.cn/prog/wapsite/sso/' + rand
    headers = self.get_headers(url)
    req = self.pack_request(url, headers, postdata)
    page = self.urlopen_read(req)
    # The response is a tiny page whose first link redirects to weibo.cn
    # carrying the session parameters.
    link = HTML.fromstring(page).xpath("//a/@href")[0]
    if not link.startswith('http://'):
        link = 'http://weibo.cn/%s' % link
    # Follow the redirect so the session cookies get set.
    headers = self.get_headers(link)
    req = self.pack_request(link, headers)
    self.urlopen_read(req)
    link = urldecode(link)
    try:
        # 'u' holds the nested query string with the session params; a
        # missing key means the credentials were rejected.
        self.login_params = urldecode(link['u'])
        self.cj.save(self.cookie_file, True, True)
        msg = 'weibo.cn: login succeed.'
        write_message(msg)
        return True
    except KeyError:
        msg = 'Login failed: it may caused by the wrong username/password.\nPlease check.'
        logger.info(msg)
        write_message(msg, self.window)
        return False
def _crawl(parser, msg_id, page, num_pages='?'):
    # Crawl one reposts page; best effort, None when it cannot be parsed.
    status = 'Crawl message(%s)\'s reposts-page:%s:%s' % (self.msg_id, num_pages, page)
    write_message(status, self.window)
    fragment, num_pages = self._fetch_msg_repost(msg_id, page)
    if fragment is None:
        return None
    try:
        parser.parse(pq(fragment))
        return num_pages
    except:
        return None
def check_user(self, uid):
    """Check on weibo.cn whether user *uid* exists.

    Returns True/False, or None when login keeps failing or the
    request raises.  Retries up to 10 times while no session params
    are available.
    """
    url = 'http://weibo.cn/u/%s' %(uid)
    headers = self.get_headers(url)
    headers['Accept'] = '*/*'
    headers['Referer']= 'http://weibo.cn/'
    tries = 10
    for _ in range(tries):
        try:
            if self.login_params is None:
                # No session parameters yet: force a (re)login first.
                if not self.check_cookie():
                    continue
            # Merge the session params (gsid etc.) into the query string.
            params = urldecode(url)
            params.update(self.login_params)
            url = '%s?%s' % (url.split('?', 1)[0], urllib.urlencode(params))
            req = self.pack_request(url, headers)
            self.n_connections += 1
            page = None
            with contextlib.closing(urllib2.urlopen(req)) as resp:
                if resp.info().get('Content-Encoding') == 'gzip':
                    page = self.gzip_data(resp.read())
                else:
                    page = resp.read()
            # Page still shows the login link -> we are not logged in.
            # NOTE(review): after a successful relogin this still judges
            # the stale page below -- confirm that is intended.
            if u'登录' in page:
                if not self.check_cookie():
                    msg = 'Error in check user: login failed.'
                    write_message(msg, self.window)
                    return None
            return not (u'用户不存在' in page
                        or 'User does not exists' in page
                        or u'抱歉,您当前访问的用户状态异常,暂时无法访问。' in page)
        except Exception, e:
            msg = 'Error in check_user: exit Exception. %s' %str(e)
            logger.info(msg)
            write_message(msg, self.window)
            return None
def fetch(self, url, query):
    """GET *url* and extract the content selected by *query*; None when
    sina has temporarily blocked the account for too-frequent access."""
    headers = self.get_headers(url)
    headers['Accept'] = '*/*'
    headers['Referer'] = 'http://weibo.com/'
    req = self.pack_request(url, headers)
    page = self.urlopen_read(req)
    blocked = ("$CONFIG['allowConnect'] = 'false'" in page
               or "$CONFIG['allowConnect']='false'" in page)
    if blocked:
        msg = u'访问频繁,被新浪暂封了.'
        logger.info(msg)
        write_message(msg, self.window)
        return None
    return self.extract_content(page, query)
def _get_first_part(headers, body, url):
    # Fetch the first chunk of a paged ajax endpoint and return its
    # 'data' payload (or the raw page when the response is not JSON,
    # or None on an API error).
    # NOTE(review): mutates *body* in place (__rnd / pre_page) and
    # appends the query string with no separator -- presumably *url*
    # already ends with '&' or '?'; confirm against the caller.
    body['__rnd'] = str(int(time.time() * 1000))
    body['pre_page'] = body['page'] - 1
    url = url + urllib.urlencode(body)
    req = self.pack_request(url, headers)
    page= self.urlopen_read(req)
    try:
        if json.loads(page)['code'] == "100000":
            return json.loads(page)['data']
        else:
            msg = json.loads(page)['msg']
            logger.info(msg)
            write_message(msg, self.window)
            return None
    except ValueError:
        # Response was not JSON -- hand the raw page back.
        return page
def redo_login(self, login_url): login_ok = False try: headers = self.get_headers(login_url) headers['Referer'] = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.5)' req = self.pack_request(login_url, headers) if self.urlopen_read(req) is not None: self.cj.save(self.cookie_file, True, True) msg = 'login success' write_message(msg) login_ok = True except Exception, e: msg = 'Error in redo_login. %s' %str(e) logger.info(msg) write_message(msg, self.window)
def urlopen_read(self, req): tries = 10 for i in range(tries): try: self.n_connections += 1 page = None with contextlib.closing(urllib2.urlopen(req)) as resp: if resp.info().get('Content-Encoding') == 'gzip': page = self.gzip_data(resp.read()) else: page = resp.read() return page except Exception, e: if e.code == 404: msg = 'Error in urlopen_read: %s.' %str(e) write_message(msg, self.window) return None if i < tries - 1: sec = (i + 1) * 5 msg = ('Error in urlopen_read: %s\nTake a rest: %s seconds, and retry.' %(str(e), sec)) write_message(msg, self.window) time.sleep(sec) else: msg = 'Exit incorrect. %s' %str(e) logger.info(msg) write_message(msg, self.window) return None
def valid_cookie(self, html=''): html = str(html) if not html: url = 'http://weibo.com/kaifulee' headers = self.get_headers(url) html = self.get_content_head(url, headers=headers) if not html: msg = 'need relogin.' logger.info(msg) write_message(msg, self.window) self.clear_cookie(self.cookie_file) return False html = str(html) html = html.replace('"', "'") if 'sinaSSOController' in html: p = re.compile('location\.replace\(\'(.*?)\'\)') try: login_url = p.search(html).group(1) headers = self.get_headers(login_url) req = self.pack_request(url=login_url, headers=headers) html = self.urlopen_read(req) self.cj.save(self.cookie_file, True, True) except Exception, e: msg = 'relogin failed. %s' %str(e) logger.info(msg) write_message(msg, self.window) self.clear_cookie(self.cookie_file) return False
def _fetch_msg_comment(self, msg_id, page=1):
    """Fetch one comments page for *msg_id*, retrying wrong pages.

    Returns (html, num_pages); when every retry fails, self.error is
    set and the last (bad) page is returned.
    """
    html, num_pages = self.fetcher.fetch_msg_comments(msg_id, page)
    page_right = self._check_page_right(html)
    if page_right:
        return html, num_pages
    tries = 0
    while not page_right and tries <= 10:
        time.sleep(10)
        self.fetcher.check_cookie()
        sec = (tries + 1) * 10
        write_message("_fetch trying: %s, sleep: %s seconds" % (tries, sec), self.window)
        time.sleep(sec)
        # BUG FIX: the retry fetched *reposts* instead of comments.
        html, num_pages = self.fetcher.fetch_msg_comments(msg_id, page)
        page_right = self._check_page_right(html)
        if page_right:
            return html, num_pages
        tries += 1
    # BUG FIX: the original set self.error after a while/else return,
    # making the assignment unreachable dead code.
    self.error = True
    return html, num_pages
def _fetch_weibo(self, uid, page):
    """Fetch one weibos page for *uid*, retrying wrong-looking pages.

    Returns the html; when every retry fails, self.error is set and the
    last (bad) page is returned.
    """
    html = self.fetcher.fetch_weibo(uid, page)
    page_right = self._check_page_right(html)
    if page_right:
        return html
    tries = 0
    while not page_right and tries <= 10:
        time.sleep(10)
        self.fetcher.check_cookie()
        sec = (tries + 1) * 10
        write_message('_fetch trying: %s, sleep: %s seconds' % (tries, sec), self.window)
        time.sleep(sec)
        html = self.fetcher.fetch_weibo(uid, page)
        page_right = self._check_page_right(html)
        if page_right:
            return html
        tries += 1
    # BUG FIX: in the original this assignment sat after a while/else
    # return and could never execute.
    self.error = True
    return html
def check_user(self, uid): is_exist = False url = 'http://weibo.com/u/%s' % (uid) headers = self.get_headers(url) headers['Accept'] = '*/*' headers['Referer'] = 'http://weibo.com/' req = self.pack_request(url, headers) tries = 10 for i in range(tries): try: self.n_connections += 1 page = None with contextlib.closing(urllib2.urlopen(req)) as resp: if resp.info().get('Content-Encoding') == 'gzip': page = self.gzip_data(resp.read()) else: page = resp.read() if "$CONFIG['islogin'] = '******'" in page or "$CONFIG['islogin']='******'" in page: is_exist = not (u'错误提示 新浪微博' in page) return is_exist else: if not self.check_cookie(): msg = 'Error in check_user: login failed' logger.info(msg) write_message(msg, self.window) return None except urllib2.HTTPError, e: if e.code == 302 and e.geturl is not None: is_exist = True else: is_exist = False return is_exist except urllib2.URLError, e: if isinstance(e.reason, socket.timeout) and (i < tries - 1): sec = (i + 1) * 5 msg = ( 'Error in check_user:timeout. Retry: (%s-%s)-sleep %s seconds' % (tries, i, sec)) write_message(msg, self.window) time.sleep(sec) else: msg = 'Error in check_user: retry timeout. %s' % str(e) logger.info(msg) write_message(msg, self.window) return None
def crawl_weibos(self):
    # Crawl every weibos page of self.uid: page 1 synchronously to learn
    # the total page count, the remaining pages on a 5-thread pool.
    def _crawl(parser, uid, page, num_pages=""):
        # One page: fetch, parse; 0 on any failure.
        msg = "Crawl user(%s)'s weibos-page: %s:%s" % (self.uid, num_pages, page)
        write_message(msg, self.window)
        html = self._fetch_weibo(uid, page)
        try:
            pq_doc = pq(html)
            return parser.parse(pq_doc)
        except:
            return 0

    msg = "Checking: whether user(%s) exists or not..." % self.uid
    write_message(msg, self.window)
    is_exist = self.fetcher.check_user(self.uid)
    if is_exist is None:
        # Network/login failure: abort quietly.
        return
    if not is_exist:
        msg = "Not exist: %s." % self.uid
        logger.info(msg)
        write_message(msg, self.window)
        return
    self.storage = FileStorage(self.uid, settings.MASK_WEIBO, self.store_path)
    start_time = time.time()
    parser = ComWeibosParser(self.uid, self.storage, weibos_storage=self.weibos_storage)
    # First page also reports the total page count.
    num_pages = _crawl(parser, self.uid, page=1)
    pages = [i for i in xrange(2, num_pages + 1)]
    if len(pages) > 0:
        n_threads = 5
        worker_manager = WorkerManager(n_threads)
        for pg in pages:
            worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)
        worker_manager.wait_all_complete()
    cost_time = int(time.time() - start_time)
    msg = "Crawl user(%s)'s weibos: total page=%s," " cost time=%s sec, connections=%s" % (
        self.uid,
        num_pages,
        cost_time,
        self.fetcher.n_connections,
    )
    logger.info(msg)
    write_message(msg, self.window)
def crawl_weibos(self):
    # Crawl self.uid's weibos.  The worker-pool fan-out over pages 2..N
    # is currently disabled (kept below as an inert triple-quoted
    # string), so only page 1 is actually crawled.
    def _crawl(parser, uid, page, num_pages=''):
        # One weibos page; 0 on any failure.
        msg = 'Crawl user(%s)\'s weibos-page: %s:%s' %(self.uid, num_pages, page)
        write_message(msg, self.window)
        html = self._fetch_weibo(uid, page)
        try:
            pq_doc = pq(html)
            return parser.parse(pq_doc)
        except:
            return 0

    msg = 'Checking: whether user(%s) exists or not...' %self.uid
    write_message(msg, self.window)
    is_exist = self.fetcher.check_user(self.uid)
    if is_exist is None:
        return
    if not is_exist:
        msg = 'Not exist: %s.' %self.uid
        logger.info(msg)
        write_message(msg, self.window)
        return
    self.storage = FileStorage(self.uid, settings.MASK_WEIBO, self.store_path)
    start_time = time.time()
    parser = ComWeibosParser(self.uid, self.storage, weibos_storage=self.weibos_storage)
    num_pages = _crawl(parser, self.uid, page=1)
    pages = [i for i in xrange(2, num_pages+1)]
    # Disabled multi-threaded crawl of the remaining pages:
    """
    if len(pages) > 0:
        n_threads = 5
        worker_manager = WorkerManager(n_threads)
        for pg in pages:
            worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)
        worker_manager.wait_all_complete()
    """
    cost_time = int(time.time() - start_time)
    msg = ('Crawl user(%s)\'s weibos: total page=%s,'
           ' cost time=%s sec, connections=%s'
           %(self.uid, num_pages, cost_time, self.fetcher.n_connections))
    logger.info(msg)
    write_message(msg, self.window)
def crawl_msg_reposts(self):
    # Crawl all repost pages of self.msg_url: resolve the message id,
    # crawl page 1 to learn the page count, then fan the remaining
    # pages out over a 5-thread worker pool.
    def _crawl(parser, msg_id, page, num_pages=''):
        # One reposts page; best-effort parse, returns the page total.
        msg = 'Crawl message(%s)\'s reposts-page:%s:%s' %(self.msg_id, num_pages, page)
        write_message(msg, self.window)
        html, num_pages = self._fetch_msg_repost(msg_id, page)
        try:
            pq_doc = pq(html)
            parser.parse(pq_doc)
        except:
            pass
        return num_pages

    msg = 'Checking: whether message exists or not...'
    write_message(msg, self.window)
    msg_id = self.fetcher.check_message(self.msg_url)
    if msg_id is None:
        msg = 'Not exist: %s.' %self.msg_url
        logger.info(msg)
        write_message(msg, self.window)
        return
    self.msg_id = msg_id
    self.storage= FileStorage(self.msg_id, settings.MASK_REPOST, self.store_path)
    start_time = time.time()
    parser = ComRepostsParser(msg_id, self.storage)
    # First page also reports the total page count.
    num_pages = _crawl(parser, self.msg_id, 1)
    pages = [i for i in xrange(2, num_pages+1)]
    if len(pages) > 0:
        n_threads = 5
        worker_manager = WorkerManager(n_threads)
        for pg in pages:
            worker_manager.add_job(_crawl, parser, self.msg_id, pg, num_pages)
        worker_manager.wait_all_complete()
    cost_time = int(time.time() - start_time)
    msg = ('Crawl message(%s)\'s reposts: total page=%s,'
           ' cost time=%s sec, connections=%s'
           %(self.msg_id, num_pages, cost_time, self.fetcher.n_connections))
    logger.info(msg)
    write_message(msg, self.window)
def crawl_infos(self):
    # Crawl the profile page of self.uid and store the parsed fields.
    msg = 'Checking: whether user(%s) exists or not...' %self.uid
    write_message(msg, self.window)
    is_exist= self.fetcher.check_user(self.uid)
    if is_exist is None:
        # Network/login trouble: abort quietly.
        return
    if not is_exist:
        msg = 'Not exist: %s.' %self.uid
        logger.info(msg)
        write_message(msg, self.window)
        return
    msg = 'Crawl user(%s)\'s profile' %self.uid
    logger.info(msg)
    write_message(msg, self.window)
    self.storage = FileStorage(self.uid, settings.MASK_INFO, self.store_path)
    start_time = time.time()
    url = 'http://weibo.com/%s/info' % self.uid
    parser = ComInfosParser(self.uid, self.storage)
    html = self._fetch(url, query=settings.QUERY_INFO)
    try:
        pq_doc = pq(html)
        parser.parse(pq_doc)
    except:
        # Best effort: a broken page should not kill the crawl.
        pass
    cost_time = int(time.time() - start_time)
    msg = ('Crawl user(%s)\'s infos: cost time=%s sec, connections=%s'
           %(self.uid, cost_time, self.fetcher.n_connections))
    logger.info(msg)
    write_message(msg, self.window)
def check_new_weibos(self):
    # Re-crawl only the first weibos page of self.uid to pick up posts
    # published since the last run (new posts appear at the front).
    def _crawl(parser, uid, page, num_pages=''):
        # One page: fetch, parse; 0 on any failure.
        msg = 'check new weibo in user(%s)\'s weibos-page: %s:%s' %(self.uid, num_pages, page)
        write_message(msg, self.window)
        html = self._fetch_weibo(uid, page)
        try:
            pq_doc = pq(html)
            return parser.parse(pq_doc)
        except:
            return 0

    msg = 'Checking: whether user(%s) exists or not...' %self.uid
    write_message(msg, self.window)
    is_exist = self.fetcher.check_user(self.uid)
    if is_exist is None:
        return
    if not is_exist:
        msg = 'Not exist: %s.' %self.uid
        logger.info(msg)
        write_message(msg, self.window)
        return
    self.storage = FileStorage(self.uid, settings.MASK_WEIBO, self.store_path)
    start_time = time.time()
    parser = ComWeibosParser(self.uid, self.storage)
    # Only page 1 is fetched here.
    num_pages = _crawl(parser, self.uid, page=1)
    cost_time = int(time.time() - start_time)
    msg = ('Crawl user(%s)\'s weibos: total page=%s,'
           ' cost time=%s sec, connections=%s'
           %(self.uid, num_pages, cost_time, self.fetcher.n_connections))
    logger.info(msg)
    write_message(msg, self.window)