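# The node classes below share one set of dependencies. A plausible import
# preamble is sketched here: the stdlib imports are certain from the code,
# but the module paths for the project's own helpers (ClientServiceBase,
# TimedAgentPool, getAgent, request, wait, log, InfiniteLoginError, and the
# module constants SUFFIX, ONE_HOUR, WEIBO_FEED_PATTERN, WEIBO_COUNT_PATTERN,
# users, cities, url_decode) are not shown in this file and are assumptions.
import os
import time
import json
import socket
import random
import urllib
import urllib2
import cPickle

import lxml.etree
from heapq import heappop
from urllib import quote_plus
from cookielib import CookieJar
from twisted.internet.defer import inlineCallbacks, returnValue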
class NodeService(ClientServiceBase):
    ''' Sina Weibo friends spider node. '''
    servicename = 'observer.sina.users.user_spider'

    def __init__(self, *args, **kwargs):
        ''' '''
        ClientServiceBase.__init__(self, *args, **kwargs)
        cfg = kwargs['cfg']
        self.name = socket.gethostname() + cfg.prefix  # node name
        self.proxy = cfg.http_proxy  # not used
        self.userAgent = cfg.http_agent
        # The original referenced self.interval_min / interval_max /
        # login_interval without ever assigning them; read them from
        # cfg as the other nodes do.
        self.interval_min = cfg.http_interval_min
        self.interval_max = cfg.http_interval_max
        self.login_interval = cfg.login_interval
        self.max_agent = cfg.max_agent
        self.agentPool = TimedAgentPool(self.interval_min,
                                        self.interval_max,
                                        self.login_interval)
        self.token = self.get_token()
        self.last_clear = 0
        self.ready = True

    def get_token(self):
        ''' Fetch a usable access token. '''
        url = 'http://insight.bestminr.com/get_token'
        return json.loads(urllib.urlopen(url).read()).get('access_token')

    def addAgent(self, seq):
        ''' Add a new agent to the agentPool. '''
        agent = getAgent(self.proxy, self.userAgent)
        agent.remove = False
        agent.seq = seq
        self.agentPool.initAgent(agent)
        self.searchLoop(agent)

    @inlineCallbacks
    def startService(self):
        ''' start the fetch service '''
        os.environ['TZ'] = 'PRC'
        time.tzset()
        yield ClientServiceBase.startService(self)
        self.fillAgents()

    @inlineCallbacks
    def fillAgents(self):
        ''' Keep the agent pool filled up to max_agent. '''
        while 1:
            seq = 0
            while len(self.agentPool.agents) < self.max_agent:
                seq += 1
                self.addAgent(seq)
            yield wait(10.)

    @inlineCallbacks
    def searchLoop(self, agent):
        ''' Pull uids from the controller and fetch until removed. '''
        needbreak = False
        while 1:
            if agent.remove:
                self.agentPool.removeAgent(agent)
                break
            reqid, uid = yield self.callController('nextRequest', 'extract')
            log.info('Got uid %s from server' % uid)
            try:
                # search() requires the access token; the original call
                # left it out and would have raised a TypeError.
                result = yield self.search(agent, uid, self.token)
            except InfiniteLoginError:
                log.exception()
                yield self.callController("fail", uid=uid)
                result = None
                needbreak = True
            except:
                log.exception()
                result = None
            self.callController('sendResult', reqid, uid, result)
            if needbreak:
                break

    @inlineCallbacks
    def getContent(self, agent, uid, token):
        ''' Fetch the friend id list of a uid from the Weibo API. '''
        url = ('https://api.weibo.com/2/friendships/friends/ids.json'
               '?uid=%s&access_token=%s') % (uid, token)
        result = yield request(agent, url)
        returnValue(result)

    @inlineCallbacks
    def search(self, agent, uid, token):
        ''' '''
        data = yield self.getContent(agent, uid, token)
        if data is None:
            log.debug("Got Something Wrong with uid: %s" % uid)
            returnValue(None)
        returnValue(data)
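# Minimal offline sketch of what this node hands back to the controller:
# search() returns the raw JSON body of friendships/friends/ids.json, so
# the receiving side can decode it as below. The sample payload mirrors
# the publicly documented response shape; the values are invented.
import json as _json

def decode_friend_ids(raw):
    """Return (ids, next_cursor) from a friends/ids.json body."""
    d = _json.loads(raw)
    return d.get('ids', []), d.get('next_cursor', 0)

if __name__ == '__main__':
    sample = '{"ids": [1404376560, 2009340100], "next_cursor": 0, "total_number": 2}'
    print decode_friend_ids(sample)   # -> ([1404376560, 2009340100], 0)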
class NodeService(ClientServiceBase):
    ''' Client node. '''
    servicename = 'observer.creditor.active_spider'

    def __init__(self, *args, **kwargs):
        ''' '''
        ClientServiceBase.__init__(self, *args, **kwargs)
        cfg = kwargs['cfg']
        self.name = socket.gethostname() + cfg.prefix  # node name
        self.proxy = cfg.http_proxy  # not used
        self.user_agent = cfg.http_agent
        self.max_agent = cfg.max_agent
        self.agent_pool = TimedAgentPool()
        self.last_clear = 0
        self.ready = True

    def addAgent(self, seq):
        ''' Add a new agent to the agent_pool. '''
        agent = getAgent(self.proxy, self.user_agent)
        agent.remove = False
        agent.seq = seq
        self.agent_pool.initAgent(agent)
        self.searchLoop(agent)

    @inlineCallbacks
    def startService(self):
        ''' start the fetch service '''
        os.environ['TZ'] = 'PRC'
        time.tzset()
        yield ClientServiceBase.startService(self)
        self.fillAgents()

    @inlineCallbacks
    def fillAgents(self):
        ''' '''
        while 1:
            seq = 0
            while len(self.agent_pool.agents) < self.max_agent:
                seq += 1
                self.addAgent(seq)
            yield wait(10.)

    @inlineCallbacks
    def searchLoop(self, agent):
        ''' '''
        needbreak = False
        while 1:
            result = None
            if agent.remove:
                self.agent_pool.removeAgent(agent)
                break
            reqid, task = yield self.callController('nextRequest', 'extract')
            log.info(repr(task))
            try:
                result = yield self.search(agent, task)
                log.debug('Got data %s' % repr(result))
            except InfiniteLoginError:
                log.exception()
                yield self.callController("fail", task=task)
                needbreak = True
            except:
                log.exception()
            self.callController('sendResult', reqid, task, json.dumps(result))
            if needbreak:
                break

    @inlineCallbacks
    def getContent(self, agent, task):
        ''' download the target page '''
        tbody = task.tbody
        req_url = urllib2.urlparse.urljoin(tbody.get('prefix'),
                                           tbody.get('suffix'))
        url = req_url % (tbody.get('ccode'), tbody.get('page'))
        log.debug('Getting data with url: %s' % url)
        result = yield request(agent, url)
        returnValue(result)

    @staticmethod
    def parse_pages(el):
        ''' Extract the last page number from the pager; -2 if absent. '''
        try:
            mmc = el.xpath("./div[@class='page_header clearfix']")[0]
            result = filter(
                lambda x: x.isdigit(),
                mmc.xpath('./div[@class="l_content"]/a/text()'))[-1]
        except IndexError:
            result = -2
        return result

    @staticmethod
    def parse_shop_url(el):
        try:
            href = el.xpath("./td/a/@href")[0]
        except IndexError:
            href = ""
        return href

    @staticmethod
    def parse_items(el):
        ''' Collect shop detail-page links from the result list. '''
        items = el.xpath("//div[@class='page_item']")
        infos = map(
            lambda x: x.xpath(
                "./table/tbody/tr/td[2]/table[@class='shopinfo']/tr"),
            items)
        hrefs = filter(lambda x: x,
                       map(lambda x: NodeService.parse_shop_url(x[0]), infos))
        return hrefs

    @inlineCallbacks
    def search(self, agent, task):
        ''' Fetch the shop listing page; extract page count and links. '''
        task = cPickle.loads(task)
        pages, hrefs = -1, None
        try:
            data = yield self.getContent(agent, task)
            el = lxml.etree.HTML(data)
            mc = el.xpath(
                "//div[@class='r_sub_box']/div[@class='middle_content']"
                "/div[@class='page_content clearfix']")[0]
            pages = NodeService.parse_pages(mc)
            hrefs = map(
                lambda x: urllib2.urlparse.urljoin(task.tbody.get('prefix'),
                                                   x),
                NodeService.parse_items(mc))
        except Exception as msg:
            log.debug("Got Something Wrong with url: %s Error: %s"
                      % (repr(task), repr(msg)))
        returnValue((pages, json.dumps(hrefs)))
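# Tasks for this node arrive pickled: the controller sends a cPickle blob
# and search() above unpickles it before reading task.tbody. A minimal
# stand-in for the wire format follows; the Task class here is
# hypothetical, only the tbody attribute and its keys come from the
# code above.
import cPickle as _cPickle

class Task(object):
    def __init__(self, tbody):
        self.tbody = tbody

if __name__ == '__main__':
    blob = _cPickle.dumps(Task({'prefix': 'http://example.com/',
                                'suffix': 'shops/%s/page/%s',
                                'ccode': '110000', 'page': 1}))
    task = _cPickle.loads(blob)
    print task.tbody.get('prefix')   # -> http://example.com/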
class NodeService(ClientServiceBase):
    ''' Client node. '''
    servicename = 'observer.creditor.active_spider'

    def __init__(self, *args, **kwargs):
        ''' '''
        ClientServiceBase.__init__(self, *args, **kwargs)
        cfg = kwargs['cfg']
        self.name = socket.gethostname() + cfg.prefix  # node name
        self.proxy = cfg.http_proxy  # not used
        self.user_agent = cfg.http_agent
        self.max_agent = cfg.max_agent
        self.agent_pool = TimedAgentPool()
        self.last_clear = 0
        self.ready = True

    def addAgent(self, seq):
        ''' Add a new agent to the agent_pool. '''
        agent = getAgent(self.proxy, self.user_agent)
        agent.remove = False
        agent.seq = seq
        self.agent_pool.initAgent(agent)
        self.searchLoop(agent)

    @inlineCallbacks
    def startService(self):
        ''' start the fetch service '''
        os.environ['TZ'] = 'PRC'
        time.tzset()
        yield ClientServiceBase.startService(self)
        self.fillAgents()

    @inlineCallbacks
    def fillAgents(self):
        ''' '''
        while 1:
            seq = 0
            while len(self.agent_pool.agents) < self.max_agent:
                seq += 1
                self.addAgent(seq)
            yield wait(10.)

    @inlineCallbacks
    def searchLoop(self, agent):
        ''' '''
        needbreak = False
        while 1:
            result = None
            if agent.remove:
                self.agent_pool.removeAgent(agent)
                break
            reqid, task = yield self.callController('nextRequest', 'data')
            log.info('Got Task %s with reqid: %s' % (repr(task), reqid))
            try:
                result = yield self.search(agent, task)
                log.debug('Got data %s' % repr(result))
            except InfiniteLoginError:
                log.exception()
                yield self.callController("fail", task=task)
                needbreak = True
            except:
                log.exception()
            self.callController('sendResult', reqid, task, json.dumps(result))
            if needbreak:
                break

    @inlineCallbacks
    def getContent(self, agent, task):
        ''' download the target page '''
        task = cPickle.loads(task)
        #task = json.loads(task)
        tbody = task.tbody
        url = tbody.get('task')
        log.debug('Getting data with url: %s' % url)
        result = yield request(agent, url)
        returnValue((result, tbody.get('prefix')))

    @inlineCallbacks
    def search(self, agent, task):
        ''' Fetch a shop detail page and extract its info blocks. '''
        result = {}
        try:
            data, url = yield self.getContent(agent, task)
            el = lxml.etree.HTML(data)
            #mc = el.xpath("//div[@class='r_sub_box']/div[@class='middle_content']/div[@class='page_content clearfix']")[0]
            mc = el.xpath(
                "//div[@class='r_sub_box']/div[@class='content_detail']")
            try:
                basic_info = NodeService.parse_basic_info(mc[0])
            except IndexError:
                basic_info = {}
            try:
                intro_info = NodeService.parse_intro_info(mc[1])
            except IndexError:
                intro_info = {}
            try:
                extra_info = NodeService.parse_extra_info(mc[2])
                for u in extra_info['extra']:
                    u['link'] = urllib2.urlparse.urljoin(url, u['link'])
            except IndexError:
                extra_info = {}
            result.update(basic_info)
            result.update(intro_info)
            result.update(extra_info)
        except Exception as msg:
            log.debug("Got Something Wrong with Task: %s Error: %s"
                      % (repr(task), repr(msg)))
        returnValue(result)

    @staticmethod
    def parse_basic_info(el):
        result = {}
        try:
            result['pic'] = el.xpath('//div[@id="item"]/a/img/@src')[0]
            result['title'] = el.xpath(
                "./div[@class='r_content']/div[@class='title']/text()"
            )[0].encode('latin1')
        except IndexError:
            pass
        trs = el.xpath("./div[@class='r_content']/div[@class='list']/table/tr")
        keys = ['avg', 'tel', 'address', 'date', 'payload', 'best_seller']
        for i, tr in enumerate(trs):
            try:
                key = keys[i]
                if key in ['tel', 'address', 'payload', 'best_seller']:
                    result[key] = tr.xpath('./td')[1].xpath(
                        './text()')[0].encode('latin1')
                    continue
                if key == 'date':
                    result[key] = tr.xpath('./td')[1].xpath('./text()')
                    continue
                result[key] = tr.xpath('./td/span/text()')[0].encode('latin1')
            except IndexError:
                pass
        return result

    @staticmethod
    def parse_intro_info(el):
        result = {}
        trs = el.xpath('./div[@class="list no_margintop"]/table/tr')
        keys = ['company_intro', 'preferential', 'card_detail',
                'parking', 'buss']
        for i, tr in enumerate(trs):
            try:
                key = keys[i]
                if key == 'buss':
                    result[key] = tr.xpath(
                        './td/div[@class="left"]/text()')[0].encode('latin1')
                    continue
                result[key] = tr.xpath('./td')[1].xpath(
                    './text()')[0].encode('latin1')
            except IndexError:
                pass
        return result

    @staticmethod
    def parse_extra_info(el):
        hs = []
        hrefs = el.xpath('./div[@class="other"]/a')
        for href in hrefs:
            try:
                hs.append({
                    'link': href.xpath('./@href')[0].encode('latin1'),
                    'name': href.xpath('./text()')[0].encode('latin1'),
                })
            except IndexError:
                pass
        return {'extra': hs}
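# Offline check of the xpath convention used by parse_extra_info above:
# build a tiny document shaped like the crawled page and run the same
# extraction. The markup is a guess reconstructed from the xpaths, not
# a captured page.
import lxml.etree as _etree

_SAMPLE = '''
<div class="content_detail">
  <div class="other">
    <a href="/shop/1.html">branch one</a>
    <a href="/shop/2.html">branch two</a>
  </div>
</div>
'''

def _extract_links(el):
    # same shape as NodeService.parse_extra_info
    hs = []
    for a in el.xpath('./div[@class="other"]/a'):
        try:
            hs.append({'link': a.xpath('./@href')[0],
                       'name': a.xpath('./text()')[0]})
        except IndexError:
            pass
    return {'extra': hs}

if __name__ == '__main__':
    root = _etree.HTML(_SAMPLE).xpath("//div[@class='content_detail']")[0]
    print _extract_links(root)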
class NodeService(ClientServiceBase):
    ''' Client node. '''
    servicename = 'observer.taobao.active_spider'

    def __init__(self, *args, **kwargs):
        ''' '''
        ClientServiceBase.__init__(self, *args, **kwargs)
        cfg = kwargs['cfg']
        self.name = socket.gethostname() + cfg.prefix  # node name
        self.proxy = cfg.http_proxy  # not used
        self.user_agent = cfg.http_agent
        self.max_agent = cfg.max_agent
        self.agent_pool = TimedAgentPool()
        self.last_clear = 0
        self.ready = True

    def addAgent(self, seq):
        ''' Add a new agent to the agent_pool. '''
        agent = getAgent(self.proxy, self.user_agent)
        agent.remove = False
        agent.seq = seq
        self.agent_pool.initAgent(agent)
        self.searchLoop(agent)

    @inlineCallbacks
    def startService(self):
        ''' start the fetch service '''
        os.environ['TZ'] = 'PRC'
        time.tzset()
        yield ClientServiceBase.startService(self)
        self.fillAgents()

    @inlineCallbacks
    def fillAgents(self):
        ''' '''
        while 1:
            seq = 0
            while len(self.agent_pool.agents) < self.max_agent:
                seq += 1
                self.addAgent(seq)
            yield wait(10.)

    @inlineCallbacks
    def searchLoop(self, agent):
        ''' '''
        needbreak = False
        while 1:
            if agent.remove:
                self.agent_pool.removeAgent(agent)
                break
            reqid, tid = yield self.callController('nextRequest', 'extract')
            log.info('Got tid %s from server' % tid)
            try:
                result = yield self.search(agent, tid)
                log.debug('Got data %s' % repr(result))
            except InfiniteLoginError:
                log.exception()
                yield self.callController("fail", tid=tid)
                result = None
                needbreak = True
            except:
                log.exception()
                result = None
            self.callController('sendResult', reqid, tid, result)
            if needbreak:
                break

    @inlineCallbacks
    def getContent(self, agent, tid):
        ''' Get the target webpage. '''
        # The original signature took an unused `token` argument that the
        # caller never supplied (a TypeError waiting to happen); it is
        # dropped here.
        url = "%s%s" % (tid, SUFFIX)
        log.debug('Getting data with url: %s' % url)
        result = yield request(agent, url)
        returnValue(result)

    @inlineCallbacks
    def search(self, agent, tid):
        ''' Fetch the shop info list. '''
        result = None
        try:
            data = yield self.getContent(agent, tid)
            result = json.loads(data).get('ids')
        except Exception as msg:
            log.debug("Got Something Wrong with tid: %s Error: %s"
                      % (tid, repr(msg)))
        returnValue(json.dumps(result))
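# search() above round-trips through JSON: the page body (fetched from
# tid + SUFFIX, where SUFFIX is a module constant not shown in this file)
# is expected to be a JSON object with an "ids" list, and the result is
# re-encoded before being sent to the controller. A minimal offline
# illustration with an invented body:
import json as _json

def _extract_ids(body):
    ids = None
    try:
        ids = _json.loads(body).get('ids')
    except (ValueError, AttributeError):
        pass
    return _json.dumps(ids)

if __name__ == '__main__':
    print _extract_ids('{"ids": [101, 102, 103]}')   # -> "[101, 102, 103]"
    print _extract_ids('not json at all')            # -> "null"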
class NodeService(ClientServiceBase):
    ''' sweibo fetcher node '''
    servicename = 'observer.sina.weibo.backtracking_spider'

    def __init__(self, *args, **kwargs):
        ''' '''
        ClientServiceBase.__init__(self, *args, **kwargs)
        cfg = kwargs['cfg']
        self.name = socket.gethostname() + cfg.prefix  # node name
        self.proxy = cfg.http_proxy  # not used
        self.userAgent = cfg.http_agent
        self.interval_min = cfg.http_interval_min
        self.interval_max = cfg.http_interval_max
        self.avg_interval = (self.interval_min + self.interval_max) / 2.0
        self.login_interval = cfg.login_interval
        self.max_agent = cfg.max_agent
        self.agentPool = TimedAgentPool(
            self.interval_min,
            self.interval_max,
            self.login_interval,
        )
        self.last_clear = 0
        self.count = int((3600.0 * 2) / self.avg_interval)
        self.ready = False

    def addAgent(self, username, password, seq):
        ''' Add a new agent to the pool. In fact, an agent is a
        username & password of sina weibo. '''
        cookies = CookieJar()
        agent = getAgent(
            username,
            password,
            self.proxy,
            self.userAgent,
            cookies,
        )
        agent.remove = False
        agent.seq = seq
        self.agentPool.initAgent(agent)
        self.searchLoop(agent)

    @staticmethod
    def getRegion(province, city):
        ''' '''
        return 'custom:%d:%d' % (province, city)

    @staticmethod
    def timeScopeStr(t):
        ''' '''
        return t.strftime('%Y-%m-%d-') + str(t.hour)

    @staticmethod
    def getTimeScope(begintime, endtime):
        ''' '''
        return 'custom:%s:%s' % (
            NodeService.timeScopeStr(begintime),
            NodeService.timeScopeStr(endtime),
        )

    @inlineCallbacks
    def startService(self):
        ''' start the fetch service '''
        os.environ['TZ'] = 'PRC'
        time.tzset()
        yield ClientServiceBase.startService(self)
        self.fillAgents()

    @inlineCallbacks
    def fillAgents(self):
        ''' '''
        while 1:
            while len(self.agentPool.agents) < self.max_agent:
                username, password, seq = heappop(users)
                if username is not None:
                    self.addAgent(username, password, seq)
                    yield wait(self.login_interval)
            yield wait(60.0)

    @inlineCallbacks
    def searchLoop(self, agent=None):
        # FIXME REFACTOR THIS METHOD
        needbreak = False
        while True:
            if agent.remove:
                self.agentPool.removeAgent(agent)
                break
            reqid, keyword, page, timescope, region, skid = \
                yield self.callController('nextRequest')
            begintime = timescope.get('start_date')
            endtime = timescope.get('end_date')
            timescope = self.getTimeScope(begintime, endtime - ONE_HOUR)
            try:
                result = yield self.search(
                    agent=agent,
                    keyword=keyword,
                    page=page,
                    timescope=timescope,
                    region=region,
                    # the original omitted skid here, so search() tagged
                    # every feed with kid=None
                    skid=skid,
                )
            except InfiniteLoginError as msg:
                log.exception()
                yield self.callController("fail", agent.username, kid=skid)
                result = None
                needbreak = True
            except:
                log.exception()
                result = None
            log.info("Sending Data to server with kid %s" % skid)
            self.callController('sendResult', reqid, skid, result)
            log.info("Sent Data to server with kid %s" % skid)
            yield wait(random.uniform(self.interval_min, self.interval_max))
            if needbreak:
                break

    @inlineCallbacks
    def getContent(self, agent, keyword, page, timescope, region):
        ''' '''
        url = 'http://s.weibo.com/weibo/%s&Refer=index&page=%d' % (
            quote_plus(quote_plus(keyword)),
            page,
        )
        # the original source was mangled by HTML-entity decoding
        # ('&reg' -> '®', '&times' -> '×'); these are the intended
        # query parameters
        if region is not None:
            url += '&region=' + region
        if timescope is not None:
            url += '&timescope=' + timescope
        url += '&nodup=1'
        result = yield request(agent, url)
        returnValue(result)

    @staticmethod
    def getFeedsHtml(content):
        """ Sina Weibo search result pages fill in their content with
        JS. The payload is passed to STK.pageletM.view; in the call
        whose pid is pl_weibo_feedlist (or pl_weibo_direct), the 'html'
        key of the dict holds the HTML to display. """
        for m in WEIBO_FEED_PATTERN.finditer(content):
            d = json.loads(m.group('js'))
            if d['pid'] in ['pl_weibo_direct', 'pl_weibo_feedlist']:
                return d['html']
        return None

    @staticmethod
    def getText(el):
        """ Extract the text of a weibo body. All non-breaking spaces
        are replaced with normal spaces; emoticon images are replaced
        with their alt text, e.g. [表情]. """
        has_link = False
        text = ''
        if el.tag == 'img' and el.get('type', None) == 'face':
            text += el.get('alt', '')
        elif el.tag == 'a' and el.get('mt', '') == 'url':
            has_link = True
        t = el.text
        if t is not None:
            # the original replace() had been mangled into two identical
            # spaces; per the docstring it turns NBSP into a plain space
            text += t.replace(u'\xa0', u' ')
        for e in el.iterchildren():
            t, hl = NodeService.getText(e)
            text += t
            has_link = has_link or hl
        t = el.tail
        if t is not None:
            text += t.replace(u'\xa0', u' ')
        return text, has_link

    @staticmethod
    def parseTextContent(el):
        """ Fields carried in the weibo text block:
        name: user name
        isV: whether the user is verified
        haslink: whether the text contains a link
        text: body text
        """
        d = {}
        d['name'] = unicode(el.xpath('./a/@nick-name')[0])
        d['isV'] = bool(el.xpath(
            "./a/img[@class='approve' or @class='approve_co']"))
        text_el = el.xpath("./em")[0]
        d['text'], d['haslink'] = NodeService.getText(text_el)
        return d

    @staticmethod
    def parsePic(el):
        """ Extract the picture in the content. """
        imgl = el.xpath("./ul[@class='piclist']/li"
                        "/img[@action-type='feed_list_media_img']/@src")
        if len(imgl) == 0:
            return None
        return unicode(imgl[0])

    @staticmethod
    def parseForwardedPic(el):
        """ Extract the picture in the forwarded content. """
        imgl = el.xpath("./dd/ul[@class='piclist']/li"
                        "/img[@action-type='feed_list_media_img']/@src")
        if len(imgl) == 0:
            return None
        return unicode(imgl[0])

    @staticmethod
    def getCount(text):
        m = WEIBO_COUNT_PATTERN.search(text)
        if m:
            return int(m.group('count'))
        return 0

    @staticmethod
    def parseMiscInfo(el):
        """ Extract the comment count, repost count, publish time and
        source of a weibo; derive mid from the status URL. """
        e = el.xpath("./*[@class='info W_linkb W_textb']")[0]
        d = {
            'ccount': 0,
            'rcount': 0,
            'source': '',
        }
        al = e.xpath("./span/a")
        for a in al:
            if a.get('action-type', '') == 'feed_list_forward':
                d['rcount'] = NodeService.getCount(a.text)
            if a.get('action-type', '') == 'feed_list_comment':
                d['ccount'] = NodeService.getCount(a.text)
        d['cdate'] = int(
            e.xpath("./a[@node-type='feed_list_item_date']/@date")[0])
        hrefl = unicode(
            e.xpath("./a[@node-type='feed_list_item_date']/@href")[0]
        ).split('/')
        mid = url_decode(hrefl[-1])
        d['uid'] = int(hrefl[-2])
        d['source'] = unicode(e.xpath("./a[last()]/text()")[0])
        return d, mid

    @staticmethod
    def parseContent(el):
        """ Extract the content block. """
        d = NodeService.parseTextContent(
            el.xpath("./p[@node-type='feed_list_content']")[0])
        d['pic'] = NodeService.parsePic(el)
        md, mid = NodeService.parseMiscInfo(el)
        d['hot'] = bool(el.xpath("./div[@class='hot_feed']"))
        d.update(md)
        return d

    @staticmethod
    def parseForwardContent(el):
        """ Extract the forwarded (reposted) content. """
        try:
            log.debug("parsing forwarded content...")
            d = NodeService.parseTextContent(
                el.xpath("./dt[@node-type='feed_list_forwardContent']")[0])
            d['pic'] = NodeService.parseForwardedPic(el)
            md, mid = NodeService.parseMiscInfo(el)
            d.update(md)
            d['mid'] = mid
            return d
        except:
            # skip deleted weibos
            log.exception()
            return None

    @staticmethod
    def parseFace(el):
        """ Extract the avatar. """
        imgl = el.xpath("./a/img/@src")
        if len(imgl) == 0:
            return None
        return unicode(imgl[0])

    @staticmethod
    def parseFeed(el):
        """ Parse one weibo item.
        _id: weibo id
        ccount: comment count
        cdate: publish time
        fcount: follower count (absent)
        flash: exposure, tentatively equal to follower count (absent)
        gender: gender (absent)
        haslink: whether the text contains a link
        hot: whether it is a hot weibo
        isV: whether the author is verified
        loc: author location (absent)
        name: author nickname
        pan: sentiment (absent)
        pan_ot: corrected sentiment (absent)
        pic: picture in the content
        rcount: repost count
        retweet: { original weibo info:
            rcount, isV, uid, ccount, source, date, text, pic, id, s_name }
        segment: word segmentation (absent)
        source: source client
        text: body text
        uid: user id
        uimg: user avatar
        """
        d = {
            'fcount': None,
            'flash': None,
            'gender': None,
            'loc': None,
            'pan': None,
            'pan_ot': None,
            'segment': None,
        }
        try:
            mid = el.get('mid', None)
            if mid is not None:
                mid = int(mid)
            else:
                return None
            d['_id'] = mid
            d['id'] = mid
            d['uimg'] = NodeService.parseFace(
                el.xpath("./dt[@class='face']")[0])
            ce = el.xpath("./dd[@class='content']")[0]
            d.update(NodeService.parseContent(ce))
            rl = ce.xpath("./dl/dt[@node-type='feed_list_forwardContent']")
            d['retweet'] = {
                'rcount': None,
                'isV': None,
                'uid': None,
                'ccount': None,
                'text': None,
                'source': None,
                'date': None,
                'pic': None,
                'id': None,
                's_name': None,
            }
            if rl:
                e = rl[0].getparent()
                fd = NodeService.parseForwardContent(e)
                if fd is not None:
                    rd = d['retweet']
                    rd['rcount'] = fd['rcount']
                    rd['isV'] = fd['isV']
                    rd['uid'] = fd['uid']
                    rd['ccount'] = fd['ccount']
                    rd['text'] = fd['text']
                    rd['source'] = fd['source']
                    rd['id'] = fd['mid']
                    rd['pic'] = fd['pic']
                    rd['s_name'] = fd['name']
                    rd['date'] = fd['cdate']
            return d
        except:
            log.error('error when parsing: ' + lxml.etree.tostring(el))
            log.exception()
            return None

    @staticmethod
    def parseUser(el):
        ''' Not implemented yet; getUsers() filters out the Nones.
        _id:      ./div[@class='person_pic']/a/img/@uid
        avatar:   ./div[@class='person_pic']/a/img/@src
        nickname: ./div[@class='person_detail']/p[@class='person_name']/a/text()
        gender:   ./div[@class='person_detail']/p[@class='person_addr']/span[1]/@title
        location: ./div[@class='person_detail']/p[@class='person_addr']/span[2]/text()
        '''

    @staticmethod
    def getUsers(html_content):
        ''' User entries live under //div[@class='pl_personlist']/div. '''
        html = lxml.etree.HTML(html_content)
        el = html.xpath("//div[@class='pl_noresult']")
        if el:
            return [], 0
        el = html.xpath("//div[@class='pl_personlist']/div")
        users = filter(
            lambda e: e is not None,
            map(NodeService.parseUser, el)
        )
        tpl = html.xpath("//ul[@class='search_page_M']/li[last()-1]/a/text()")
        tps = 0
        if tpl:
            try:
                tps = int(tpl[0])
            except:
                log.info("No page found: " + repr(tpl))
        return users, tps

    @staticmethod
    def getFeeds(html_content):
        """ Weibo items live under <dl action-type="feed_list_item">. """
        html = lxml.etree.HTML(html_content)
        el = html.xpath("//div[@class='pl_noresult']")
        if el:
            return [], 0
        el = html.xpath("//dl[@action-type='feed_list_item']")
        feeds = filter(
            lambda e: e is not None,
            map(NodeService.parseFeed, el)
        )
        # Find the number of the last page. If we are already on the
        # last page this yields the previous page's number; if there is
        # only one page the element is absent.
        #log.debug('html: %s' % lxml.etree.tostring(html))
        tpl = html.xpath("//ul[@class='search_page_M']/li[last()-1]/a/text()")
        tps = 0
        if tpl:
            try:
                tps = int(tpl[0])
            except:
                tps = 0
                log.info('no page found: ' + repr(tpl))
        return feeds, tps

    @inlineCallbacks
    def _searchHour(self, agent, keyword, statuses, t):
        ''' '''
        timescope = self.getTimeScope(t, t)
        astatuses = set()
        # _search is defined elsewhere in the original module
        haveResult, feeds = yield self._search(
            agent, keyword, statuses, None, timescope,
        )
        if haveResult:
            returnValue(feeds)
            return
        #feeds = []
        for province in cities:
            feeds += yield self._searchProvince(
                agent, keyword, statuses, astatuses, timescope, province,
            )
        returnValue(feeds)

    @inlineCallbacks
    def _searchProvince(self, agent, keyword, statuses, astatuses,
                        timescope, province):
        region = self.getRegion(province, 1000)
        if len(cities[province]) > 1:
            haveResult, feeds = yield self._search(
                agent, keyword, statuses, astatuses, timescope, region,
            )
        else:
            haveResult, feeds = yield self._search(
                agent, keyword, statuses, astatuses, timescope, region, True,
            )
        if haveResult:
            returnValue(feeds)
            return
        feeds = []
        for city, _ in cities[province][1:]:
            region = self.getRegion(province, city)
            try:
                _, fs = yield self._search(
                    agent, keyword, statuses, astatuses, timescope,
                    region, True,
                )
                if fs:
                    feeds += fs
            except:
                break
        returnValue(feeds)

    @inlineCallbacks
    def search(self, agent, keyword, page, timescope=None, region=None,
               skid=None):
        ''' '''
        skeyword = keyword.encode('utf8')
        page_content = yield self.getContent(agent, skeyword, page,
                                             timescope, region)
        html_content = self.getFeedsHtml(page_content)
        if html_content is None:
            log.debug("Got feed content: " + str(page_content))
            returnValue((None, None))
        feeds, tp = self.getFeeds(html_content)
        map(lambda x: x.update({'kid': skid}), feeds)
        log.info("GotFeeds: %d, page: %d/%d" % (len(feeds), page, tp))
        returnValue((tp, feeds))
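# WEIBO_FEED_PATTERN is referenced above but not defined in this file.
# On the old s.weibo.com pages the results were injected via
# <script>STK && STK.pageletM && STK.pageletM.view({...})</script>, so a
# plausible definition is sketched below. This regex is an assumption,
# not the project's actual one (and the user-profile module later in
# this file presumably pairs the same name with an FM.view variant);
# the lazy {.*?} also assumes the JSON carries no nested '}' before ')'.
import re as _re
import json as _json

_WEIBO_FEED_PATTERN = _re.compile(
    r'STK\.pageletM\.view\((?P<js>\{.*?\})\)', _re.S)

def _find_feed_html(content):
    # mirrors NodeService.getFeedsHtml above
    for m in _WEIBO_FEED_PATTERN.finditer(content):
        d = _json.loads(m.group('js'))
        if d.get('pid') in ('pl_weibo_direct', 'pl_weibo_feedlist'):
            return d.get('html')
    return None

if __name__ == '__main__':
    page = ('<script>STK && STK.pageletM && STK.pageletM.view('
            '{"pid":"pl_weibo_feedlist","html":"<dl>...</dl>"})</script>')
    print _find_feed_html(page)   # -> <dl>...</dl>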
class NodeService(ClientServiceBase):
    ''' Sina Weibo user timeline spider node. '''
    servicename = 'observer.sina.users.user_spider'

    def __init__(self, *args, **kwargs):
        ''' '''
        ClientServiceBase.__init__(self, *args, **kwargs)
        cfg = kwargs['cfg']
        self.name = socket.gethostname() + cfg.prefix  # node name
        self.proxy = cfg.http_proxy  # not used
        self.userAgent = cfg.http_agent
        self.max_agent = cfg.max_agent
        self.agentPool = TimedAgentPool()
        self.token = self.get_token()
        self.last_clear = 0
        self.ready = True

    def get_token(self):
        ''' Fetch a usable access token. '''
        url = 'http://insight.bestminr.com/get_token'
        return json.loads(urllib.urlopen(url).read()).get('access_token')

    def addAgent(self, seq):
        ''' Add a new agent to the agentPool. '''
        agent = getAgent(self.proxy, self.userAgent)
        agent.remove = False
        agent.seq = seq
        self.agentPool.initAgent(agent)
        self.searchLoop(agent)

    @inlineCallbacks
    def startService(self):
        ''' start the fetch service '''
        os.environ['TZ'] = 'PRC'
        time.tzset()
        yield ClientServiceBase.startService(self)
        self.fillAgents()

    @inlineCallbacks
    def fillAgents(self):
        ''' '''
        while 1:
            seq = 0
            while len(self.agentPool.agents) < self.max_agent:
                seq += 1
                self.addAgent(seq)
            yield wait(1.)

    @inlineCallbacks
    def searchLoop(self, agent):
        ''' '''
        needbreak = False
        while 1:
            if agent.remove:
                self.agentPool.removeAgent(agent)
                break
            reqid, uid = yield self.callController('nextRequest', 'user')
            log.info('Got uid %s from server' % uid)
            try:
                result = yield self.search(agent, uid, self.token)
                log.debug('Got data %s' % repr(result))
            except InfiniteLoginError:
                log.exception()
                yield self.callController("fail", uid=uid)
                result = None
                needbreak = True
            except:
                log.exception()
                result = None
            self.callController('sendResult', reqid, uid, result)
            if needbreak:
                break

    @inlineCallbacks
    def getContent(self, agent, uid, token):
        ''' '''
        result = []
        try:
            url = ('https://api.weibo.com/2/statuses/user_timeline.json'
                   '?uid=%s&access_token=%s') % (uid, token)
            log.debug('Getting data with url: %s' % url)
            result = yield request(agent, url)
        except:
            pass
        returnValue(result)

    @inlineCallbacks
    def search(self, agent, uid, token):
        ''' '''
        result = None
        try:
            data = yield self.getContent(agent, uid, token)
            result = json.loads(data).get('statuses', [])
        except Exception as msg:
            log.debug("Got Something Wrong with uid: %s, Error: %s"
                      % (uid, repr(msg)))
            returnValue(None)
        returnValue(result)
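# Sketch of the statuses/user_timeline.json handling in search() above:
# the body is JSON with a "statuses" list, and a body that fails to parse
# yields None. The sample payload is invented to show the shape only.
import json as _json

def _extract_statuses(body):
    try:
        return _json.loads(body).get('statuses', [])
    except ValueError:
        return None

if __name__ == '__main__':
    print _extract_statuses(
        '{"statuses": [{"id": 1, "text": "hi"}], "total_number": 1}')
    print _extract_statuses('<html>rate limited</html>')   # -> None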
class NodeService(ClientServiceBase):
    ''' sweibo fetcher node '''
    servicename = 'observer.sina.weibo.user_active_spider'

    def __init__(self, *args, **kwargs):
        ''' '''
        ClientServiceBase.__init__(self, *args, **kwargs)
        cfg = kwargs['cfg']
        self.name = socket.gethostname() + cfg.prefix  # node name
        self.proxy = cfg.http_proxy  # not used
        self.userAgent = cfg.http_agent
        self.interval_min = cfg.http_interval_min
        self.interval_max = cfg.http_interval_max
        self.avg_interval = (self.interval_min + self.interval_max) / 2.0
        self.login_interval = cfg.login_interval
        self.max_agent = cfg.max_agent
        self.agentPool = TimedAgentPool(
            self.interval_min,
            self.interval_max,
            self.login_interval,
        )
        self.last_clear = 0
        self.count = int(3600.0 * 2 / self.avg_interval)
        self.ready = False

    def addAgent(self, username, password, seq):
        ''' Add a new agent to the pool. In fact, an agent is a
        username & password of sina weibo. '''
        cookies = CookieJar()
        agent = getAgent(
            username,
            password,
            self.proxy,
            self.userAgent,
            cookies,
        )
        agent.remove = False
        agent.seq = seq
        self.agentPool.initAgent(agent)
        self.searchLoop(agent)

    @inlineCallbacks
    def startService(self):
        ''' start the fetch service '''
        os.environ['TZ'] = 'PRC'
        time.tzset()
        yield ClientServiceBase.startService(self)
        self.fillAgents()

    @inlineCallbacks
    def getUser(self):
        ''' Get a user via a remote call. '''
        username, password, seq = yield self.callController('getUser')
        returnValue((username, password, seq))

    @inlineCallbacks
    def checkAgents(self):
        # FIXME Should think about how to realize this function
        while True:
            for agent in self.agentPool.agents:
                if not agent.remove:
                    validated = yield self.callController(
                        'refresh',
                        agent.username,
                        agent.seq,
                    )
                    if not validated:
                        agent.remove = True
            yield wait(300.0)

    @inlineCallbacks
    def fillAgents(self):
        ''' '''
        while True:
            while len(self.agentPool.agents) < self.max_agent:
                username, password, seq = yield self.getUser()
                if username is not None:
                    self.addAgent(username, password, seq)
                    yield wait(self.login_interval)
            yield wait(60.0)

    @inlineCallbacks
    def searchLoop(self, agent=None):
        # FIXME REFACTOR THIS METHOD
        needbreak = False
        while True:
            kwg = {}
            if agent.remove:
                self.agentPool.removeAgent(agent)
                break
            requestid, _, _, kwg = yield self.controllerPull()
            if not requestid:
                # the original dropped this Deferred; without the yield
                # there was no pause at all
                yield wait(random.uniform(self.interval_min,
                                          self.interval_max))
                break
            try:
                result = yield self.search(kwg.get('uid'), agent)
            except InfiniteLoginError as msg:
                log.exception()
                kwg.update({'reason': str(msg)})
                yield self.controllerFail(
                    'fail',
                    *(requestid, self.servicename, self.clientid),
                    **kwg
                )
                result = None
                needbreak = True
            except:
                log.exception()
                result = None
            kwg.update({'data': result or []})
            self.controllerPush(
                'push',
                *(requestid, self.servicename, self.clientid),
                **kwg
            )
            yield wait(random.uniform(self.interval_min, self.interval_max))
            if needbreak:
                break

    @inlineCallbacks
    def getContent(self, agent, uid):
        ''' '''
        url = ('http://weibo.com/p/100505%s/info'
               '?from=page_100505&mod=TAB#place') % (uid, )
        result = yield request(agent, url)
        returnValue(result)

    @staticmethod
    def getFeedsHtml(content):
        """ Sina user pages fill in their content with JS. The payload
        is passed to FM.view; in the call whose domid is
        Pl_Official_LeftInfo__16 or Pl_Official_LeftInfo__17, the 'html'
        key of the dict holds the HTML to display. """
        JS_DOMID = ['Pl_Official_LeftInfo__16', 'Pl_Official_LeftInfo__17']
        for m in WEIBO_FEED_PATTERN.finditer(content):
            d = json.loads(m.group('js'))
            if d['domid'] in JS_DOMID:
                return d['html']
        return None

    @staticmethod
    def getFeeds(html_content):
        ''' div[@class='infoblock'] holds the basic profile info,
        tentatively laid out as:
        {
        'name': {
            'key':   //div[@class='infoblock'][1]/div[1]/div[1]//text(),
            'value': //div[@class='infoblock'][1]/div[1]/div[2]//text()
        },
        'location': {
            'key':   //div[@class='infoblock'][1]/div[2]/div[1]//text(),
            'value': //div[@class='infoblock'][1]/div[2]/div[2]//text()
        },
        'gender': {
            'key':   //div[@class='infoblock'][1]/div[3]/div[1]//text(),
            'value': //div[@class='infoblock'][1]/div[3]/div[2]//text()
        },
        'birthday': {
            'key':   //div[@class='infoblock'][1]/div[4]/div[1]//text(),
            'value': //div[@class='infoblock'][1]/div[4]/div[2]//text()
        },
        'blog': {
            'key':   //div[@class='infoblock'][1]/div[5]/div[1]//text(),
            'value': //div[@class='infoblock'][1]/div[5]/div[2]//text()
        },
        'domain': {
            'key':   //div[@class='infoblock'][1]/div[6]/div[1]//text(),
            'value': //div[@class='infoblock'][1]/div[6]/div[2]//text()
        },
        'description': {
            'key':   //div[@class='infoblock'][7]/div[3]/div[1]//text(),
            'value': //div[@class='infoblock'][7]/div[3]/div[2]//text()
        },
        }
        '''
        html = lxml.etree.HTML(html_content)
        el = html.xpath("//div[@class='infoblock']")
        if not el:
            # the original returned ([], 0) here, which looks copied
            # from the feed-list parser; callers expect a dict
            return {}
        template = {
            u'昵称': 'name',
            u'真实姓名': 'rname',
            u'所在地': 'location',
            u'性别': 'gender',
            u'生日': 'birthday',  # e.g. 19xx-x-x
            u'星座': 'constellation',
            u'性取向': 'strend',
            u'感情状况': 'emotional',
            u'血型': 'blood',
            u'博客': 'blog',
            u'个性域名': 'domain',
            u'简介': 'description',
            u'邮箱': 'email',
            u'QQ': 'qq',
            u'MSN': 'msn',
            u'公司': 'career',
            u'大学': 'colleage',
            u'高中': 'high_school',
            u'中专技校': 'tecnical_school',
            u'初中': 'middle_school',
            u'小学': 'primary_school',
            u'标签': 'tags',
            u'认证原因': 'verified_reason',
        }
        # in fact, I can do more here, but now, I only need a counter
        count = html.xpath(
            "//form[@class='info_title'][1]/fieldset/legend//text()")
        key_xpath_template = \
            "//div[@class='infoblock'][%s]/div[%s]/div[1]//text()"
        val_xpath_template = \
            "//div[@class='infoblock'][%s]/div[%s]/div[2]//text()"
        d = dict.fromkeys(template.values(), None)
        for i in range(1, len(count) + 1):
            # FIXME DO PARSING
            for j in range(1, 15):
                k = html.xpath(key_xpath_template % (i, j))
                if len(k) > 0:
                    key = k[0]
                    v = html.xpath(val_xpath_template % (i, j))
                    if key in template:
                        vals = filter(lambda x: x,
                                      map(lambda x: x.strip(), v))
                        if vals:
                            if len(vals) > 1:
                                d[template[key]] = vals
                            else:
                                d[template[key]] = vals[0]
        return d

    @inlineCallbacks
    def search(self, uid, agent):
        ''' '''
        _, feeds = yield self._search(uid, agent)
        returnValue(feeds)

    @inlineCallbacks
    def _search(self, uid, agent):
        ''' '''
        log.debug("Fetched_uid: %s." % uid)
        page_content = yield self.getContent(agent, uid)
        try:
            html_content = self.getFeedsHtml(page_content)
        except ValueError:
            returnValue((False, None))
            return
        if html_content is None:
            log.debug('got feed content: ' + page_content)
            # bail out here instead of feeding None to lxml below
            returnValue((False, None))
        data = self.getFeeds(html_content)
        returnValue((True, data))
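# Offline illustration of the key/value scan in getFeeds() above: the
# profile page lays each field out as two sibling divs (label, value),
# addressed by positional xpath. The markup below is a guess shaped
# after those xpaths, not a captured page.
import lxml.etree as _etree

_SAMPLE = u'''
<div class="infoblock">
  <div><div>昵称</div><div>demo_user</div></div>
  <div><div>所在地</div><div>北京 朝阳区</div></div>
</div>
'''

_KEY_XPATH = "//div[@class='infoblock'][%s]/div[%s]/div[1]//text()"
_VAL_XPATH = "//div[@class='infoblock'][%s]/div[%s]/div[2]//text()"

if __name__ == '__main__':
    html = _etree.HTML(_SAMPLE)
    for j in (1, 2):
        key = html.xpath(_KEY_XPATH % (1, j))[0].strip()
        val = html.xpath(_VAL_XPATH % (1, j))[0].strip()
        print (u'%s -> %s' % (key, val)).encode('utf8')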