示例#1
0
    def __init__(self, *args, **kwargs):
        '''
        Set up the node from the ``cfg`` keyword argument.

        All arguments are forwarded unchanged to ``ClientServiceBase.__init__``.
        ``kwargs['cfg']`` must be present (``KeyError`` otherwise) and must
        expose ``prefix``, ``http_proxy``, ``http_agent`` and ``max_agent``.
        '''
        ClientServiceBase.__init__(self, *args, **kwargs)
        config = kwargs['cfg']

        # Node name: local host name followed by the configured prefix.
        hostname = socket.gethostname()
        self.name = hostname + config.prefix

        # HTTP settings copied verbatim from the configuration object.
        self.proxy = config.http_proxy  # flagged "not used" by the original author
        self.user_agent = config.http_agent
        self.max_agent = config.max_agent

        # Runtime state.
        self.agent_pool = TimedAgentPool()
        self.last_clear = 0
        self.ready = True
示例#2
0
    def __init__(self, *args, **kwargs):
        '''
        Initialise the base service, then copy the node settings from ``cfg``.

        ``cfg`` is read from ``kwargs['cfg']``; every positional and keyword
        argument is also passed on to ``ClientServiceBase.__init__``.
        '''
        ClientServiceBase.__init__(self, *args, **kwargs)
        settings = kwargs['cfg']

        # node name = host name + configured prefix
        self.name = socket.gethostname() + settings.prefix

        # flagged "not used" by the original author
        self.proxy = settings.http_proxy
        self.user_agent = settings.http_agent
        self.max_agent = settings.max_agent

        pool = TimedAgentPool()
        self.agent_pool = pool
        self.last_clear = 0
        self.ready = True
示例#3
0
    def __init__(self, *args, **kwargs):
        '''
        Initialise the base service, copy settings from ``cfg`` and fetch a token.

        ``cfg`` is read from ``kwargs['cfg']``; all arguments are forwarded to
        ``ClientServiceBase.__init__``.  ``self.get_token()`` is called
        synchronously while the object is being constructed.
        '''
        ClientServiceBase.__init__(self, *args, **kwargs)
        config = kwargs['cfg']

        # node name = host name + configured prefix
        self.name = socket.gethostname() + config.prefix

        self.proxy = config.http_proxy  # flagged "not used" by the original author
        self.userAgent = config.http_agent
        self.max_agent = config.max_agent

        # NOTE(review): interval_min, interval_max and login_interval are not
        # assigned anywhere in this method; presumably ClientServiceBase
        # provides them -- confirm, otherwise this raises AttributeError.
        pool_args = (self.interval_min, self.interval_max, self.login_interval)
        self.agentPool = TimedAgentPool(*pool_args)

        self.token = self.get_token()
        self.last_clear = 0
        self.ready = True
    def __init__(self, *args, **kwargs):
        '''
        Initialise the base service and derive the timing settings from ``cfg``.

        ``cfg`` is read from ``kwargs['cfg']`` and must expose ``prefix``,
        ``http_proxy``, ``http_agent``, ``http_interval_min``,
        ``http_interval_max``, ``login_interval`` and ``max_agent``.
        The node starts with ``ready`` set to ``False``.
        '''
        ClientServiceBase.__init__(self, *args, **kwargs)
        config = kwargs['cfg']

        # node name = host name + configured prefix
        self.name = socket.gethostname() + config.prefix

        self.proxy = config.http_proxy  # flagged "not used" by the original author
        self.userAgent = config.http_agent

        # Request pacing: bounds of the wait interval and their mean.
        low = config.http_interval_min
        self.interval_min = low
        high = config.http_interval_max
        self.interval_max = high
        self.avg_interval = (low + high) / 2.0
        self.login_interval = config.login_interval
        self.max_agent = config.max_agent

        self.agentPool = TimedAgentPool(low, high, self.login_interval)

        self.last_clear = 0
        # 3600.0 * 2 divided by the mean interval, truncated to an int.
        # NOTE(review): presumably "requests per two hours" -- confirm.
        self.count = int((2 * 3600.0) / self.avg_interval)
        self.ready = False
示例#5
0
class NodeService(ClientServiceBase):
    '''
    Client node of the ``observer.sina.users.user_spider`` service.

    Every agent in the pool repeatedly asks the controller for a uid,
    requests ``friendships/friends/ids.json`` for that uid from the Weibo API
    and sends the response back to the controller.
    '''

    servicename = 'observer.sina.users.user_spider'

    def __init__(self, *args, **kwargs):
        '''
        Initialise the base service, copy settings from ``cfg`` and fetch a token.

        ``cfg`` is read from ``kwargs['cfg']``; all arguments are forwarded to
        ``ClientServiceBase.__init__``.
        '''
        ClientServiceBase.__init__(self, *args, **kwargs)
        cfg = kwargs['cfg']
        self.name = socket.gethostname() + cfg.prefix    # node name

        self.proxy = cfg.http_proxy     # handed to getAgent() in addAgent()
        self.userAgent = cfg.http_agent
        self.max_agent = cfg.max_agent
        # NOTE(review): interval_min, interval_max and login_interval are not
        # assigned in this class; presumably ClientServiceBase provides
        # them -- confirm, otherwise this raises AttributeError.
        self.agentPool = TimedAgentPool(self.interval_min,
                                        self.interval_max,
                                        self.login_interval)
        self.token = self.get_token()
        self.last_clear = 0
        self.ready = True

    def get_token(self):
        '''
        Fetch an access token from the token endpoint.

        Returns the ``access_token`` field of the JSON response, or ``None``
        when the field is absent.  This is a blocking HTTP call.
        '''
        import contextlib

        url = 'http://insight.bestminr.com/get_token'
        # closing() makes sure the HTTP response is released even if
        # read() raises.
        with contextlib.closing(urllib.urlopen(url)) as response:
            payload = response.read()
        return json.loads(payload).get('access_token')

    def addAgent(self, seq):
        ''' Create a new agent, register it in agentPool and start its loop. '''
        agent = getAgent(self.proxy, self.userAgent)
        agent.remove = False
        agent.seq = seq
        self.agentPool.initAgent(agent)
        # The loop runs on its own; its Deferred is intentionally not awaited.
        self.searchLoop(agent)

    @inlineCallbacks
    def startService(self):
        ''' Set the time zone to PRC, start the base service, then fill the pool. '''
        os.environ['TZ'] = 'PRC'
        time.tzset()
        yield ClientServiceBase.startService(self)
        self.fillAgents()

    @inlineCallbacks
    def fillAgents(self):
        ''' Every 10 seconds, top the pool up to ``max_agent`` agents. '''
        while 1:
            # NOTE(review): seq restarts at 1 on every pass, so sequence
            # numbers are not unique across refills -- confirm intended.
            seq = 0
            while len(self.agentPool.agents) < self.max_agent:
                seq += 1
                self.addAgent(seq)
            yield wait(10.)

    @inlineCallbacks
    def searchLoop(self, agent):
        '''
        Work loop of a single agent.

        Pulls uids from the controller, fetches the data for each and sends
        the result (``None`` on failure) back.  The loop ends when the agent
        is flagged for removal or after an ``InfiniteLoginError``.
        '''
        needbreak = False
        while 1:
            if agent.remove:
                self.agentPool.removeAgent(agent)
                break
            reqid, uid = yield self.callController('nextRequest', 'extract')
            log.info('Got uid %s from server' % uid)

            try:
                # search() requires the access token; calling it without one
                # raised TypeError on every request.
                result = yield self.search(agent, uid, self.token)
            except InfiniteLoginError:
                log.exception()
                yield self.callController("fail", uid=uid)
                result = None
                needbreak = True
            except Exception:
                # Exception (not a bare except) so GeneratorExit and
                # KeyboardInterrupt are not swallowed at the yield above.
                log.exception()
                result = None
            # Sent without waiting for the controller's reply.
            self.callController('sendResult', reqid, uid, result)
            if needbreak:
                break

    @inlineCallbacks
    def getContent(self, agent, uid, token):
        ''' Request the friend-id list of ``uid`` and return the response. '''
        url = 'https://api.weibo.com/2/friendships/friends/ids.json?uid=%s&access_token=%s' % (uid, token)
        result = yield request(agent, url)
        returnValue(result)

    @inlineCallbacks
    def search(self, agent, uid, token):
        '''
        Fetch the data for ``uid``.

        Returns whatever ``getContent`` produced; a ``None`` response is
        logged and returned as ``None``.
        '''
        data = yield self.getContent(agent, uid, token)

        if data is None:
            log.debug("Got Something Wrong with uid: %s" % uid)
            returnValue((None))

        returnValue((data))
示例#6
0
class NodeService(ClientServiceBase):
    '''
    Client node of the ``observer.creditor.active_spider`` service.

    Agents pull pickled tasks from the controller, download the listing page
    described by each task and report the value found by ``parse_pages``
    together with the shop links found on the page.
    '''

    servicename = 'observer.creditor.active_spider'

    def __init__(self, *args, **kwargs):
        '''
        Initialise the base service and copy the node settings from ``cfg``.

        ``cfg`` is read from ``kwargs['cfg']``; all arguments are forwarded to
        ``ClientServiceBase.__init__``.
        '''
        ClientServiceBase.__init__(self, *args, **kwargs)
        cfg = kwargs['cfg']
        self.name = socket.gethostname() + cfg.prefix  # node name

        self.proxy = cfg.http_proxy  # handed to getAgent() in addAgent()
        self.user_agent = cfg.http_agent
        self.max_agent = cfg.max_agent
        self.agent_pool = TimedAgentPool()
        self.last_clear = 0
        self.ready = True

    def addAgent(self, seq):
        ''' Create a new agent, register it in agent_pool and start its loop. '''
        agent = getAgent(self.proxy, self.user_agent)
        agent.remove = False
        agent.seq = seq
        self.agent_pool.initAgent(agent)
        # The loop runs on its own; its Deferred is intentionally not awaited.
        self.searchLoop(agent)

    @inlineCallbacks
    def startService(self):
        ''' Set the time zone to PRC, start the base service, then fill the pool. '''
        os.environ['TZ'] = 'PRC'
        time.tzset()
        yield ClientServiceBase.startService(self)
        self.fillAgents()

    @inlineCallbacks
    def fillAgents(self):
        ''' Every 10 seconds, top the pool up to ``max_agent`` agents. '''
        while 1:
            seq = 0
            while len(self.agent_pool.agents) < self.max_agent:
                seq += 1
                self.addAgent(seq)
            yield wait(10.)

    @inlineCallbacks
    def searchLoop(self, agent):
        '''
        Work loop of a single agent.

        Pulls tasks from the controller, runs ``search`` on each and sends the
        JSON-encoded result (``null`` on failure) back.  The loop ends when
        the agent is flagged for removal or after an ``InfiniteLoginError``.
        '''
        needbreak = False
        while 1:
            result = None
            if agent.remove:
                self.agent_pool.removeAgent(agent)
                break

            reqid, task = yield self.callController('nextRequest', 'extract')
            log.info(repr(task))

            try:
                result = yield self.search(agent, task)
                log.debug('Got data %s' % repr(result))
            except InfiniteLoginError:
                log.exception()
                yield self.callController("fail", task=task)
                needbreak = True
            except Exception:
                # Exception (not a bare except) so GeneratorExit and
                # KeyboardInterrupt are not swallowed at the yield above.
                log.exception()
            # Sent without waiting for the controller's reply.
            self.callController('sendResult', reqid, task, json.dumps(result))
            if needbreak:
                break

    @inlineCallbacks
    def getContent(self, agent, task):
        '''
        Download the page described by ``task``.

        The URL is ``urljoin(prefix, suffix)`` from ``task.tbody`` with the
        task's ``ccode`` and ``page`` substituted into it.
        '''
        tbody = task.tbody

        req_url = urllib2.urlparse.urljoin(tbody.get('prefix'),
                                           tbody.get('suffix'))
        url = req_url % (tbody.get('ccode'), tbody.get('page'))
        log.debug('Getting data with url: %s' % url)
        result = yield request(agent, url)
        returnValue(result)

    @staticmethod
    def parse_pages(el):
        '''
        Return the last all-digit link text of the page header, or ``-2``.

        ``-2`` is returned when the header or a numeric link is missing.
        NOTE(review): presumably the last link is the total page count --
        confirm against the page layout.
        '''
        try:
            mmc = el.xpath("./div[@class='page_header clearfix']")[0]
            labels = mmc.xpath('./div[@class="l_content"]/a/text()')
            result = [label for label in labels if label.isdigit()][-1]
        except IndexError:
            result = -2

        return result

    @staticmethod
    def parse_shop_url(el):
        ''' Return the first ``./td/a/@href`` below ``el``, or ``""`` if none. '''
        try:
            href = el.xpath("./td/a/@href")[0]
        except IndexError:
            href = ""

        return href

    @staticmethod
    def parse_items(el):
        '''
        Return the non-empty shop links of every ``page_item`` block.

        Items without a ``shopinfo`` row or without a link are skipped, so a
        single malformed item no longer discards the whole page.
        '''
        hrefs = []
        for item in el.xpath("//div[@class='page_item']"):
            rows = item.xpath(
                "./table/tbody/tr/td[2]/table[@class='shopinfo']/tr")
            if not rows:
                # Indexing rows[0] here used to raise IndexError and abort
                # the extraction for every item on the page.
                continue
            href = NodeService.parse_shop_url(rows[0])
            if href:
                hrefs.append(href)

        return hrefs

    @inlineCallbacks
    def search(self, agent, task):
        '''
        Fetch one listing page and extract its shop links.

        ``task`` is a pickled task object.  Returns ``(pages, hrefs_json)``
        where ``hrefs_json`` is the JSON-encoded list of absolute links.  On
        a download or parse error the failure is logged and the defaults
        reached so far are returned (``-1`` and ``null`` if nothing was
        parsed).
        '''
        # NOTE(security): unpickling can execute arbitrary code; this is only
        # safe as long as tasks come from a trusted controller.
        task = cPickle.loads(task)
        pages, hrefs = -1, None
        try:
            data = yield self.getContent(agent, task)
            el = lxml.etree.HTML(data)
            mc = el.xpath(
                "//div[@class='r_sub_box']/div[@class='middle_content']/div[@class='page_content clearfix']"
            )[0]
            pages = NodeService.parse_pages(mc)
            prefix = task.tbody.get('prefix')
            hrefs = [urllib2.urlparse.urljoin(prefix, href)
                     for href in NodeService.parse_items(mc)]
        except Exception as msg:
            log.debug("Got Something Wrong with url: %s Error: %s" %
                      (repr(task), repr(msg)))

        returnValue((pages, json.dumps(hrefs)))
示例#7
0
class NodeService(ClientServiceBase):
    '''
    Client node of the ``observer.creditor.active_spider`` service.

    Agents pull pickled tasks from the controller, download the detail page
    named by each task and report the fields extracted from it.
    '''

    servicename = 'observer.creditor.active_spider'

    def __init__(self, *args, **kwargs):
        '''
        Initialise the base service and copy the node settings from ``cfg``.

        ``cfg`` is read from ``kwargs['cfg']``; all arguments are forwarded to
        ``ClientServiceBase.__init__``.
        '''
        ClientServiceBase.__init__(self, *args, **kwargs)
        cfg = kwargs['cfg']
        self.name = socket.gethostname() + cfg.prefix    # node name

        self.proxy = cfg.http_proxy     # handed to getAgent() in addAgent()
        self.user_agent = cfg.http_agent
        self.max_agent = cfg.max_agent
        self.agent_pool = TimedAgentPool()
        self.last_clear = 0
        self.ready = True

    def addAgent(self, seq):
        ''' Create a new agent, register it in agent_pool and start its loop. '''
        agent = getAgent(self.proxy, self.user_agent)
        agent.remove = False
        agent.seq = seq
        self.agent_pool.initAgent(agent)
        # The loop runs on its own; its Deferred is intentionally not awaited.
        self.searchLoop(agent)

    @inlineCallbacks
    def startService(self):
        ''' Set the time zone to PRC, start the base service, then fill the pool. '''
        os.environ['TZ'] = 'PRC'
        time.tzset()
        yield ClientServiceBase.startService(self)
        self.fillAgents()

    @inlineCallbacks
    def fillAgents(self):
        ''' Every 10 seconds, top the pool up to ``max_agent`` agents. '''
        while 1:
            seq = 0
            while len(self.agent_pool.agents) < self.max_agent:
                seq += 1
                self.addAgent(seq)
            yield wait(10.)

    @inlineCallbacks
    def searchLoop(self, agent):
        '''
        Work loop of a single agent.

        Pulls tasks from the controller, runs ``search`` on each and sends the
        JSON-encoded result (``null`` on failure) back.  The loop ends when
        the agent is flagged for removal or after an ``InfiniteLoginError``.
        '''
        needbreak = False
        while 1:
            result = None
            if agent.remove:
                self.agent_pool.removeAgent(agent)
                break

            reqid, task = yield self.callController('nextRequest', 'data')
            log.info('Got Task %s with reqid: %s' % (repr(task), reqid))

            try:
                result = yield self.search(agent, task)
                log.debug('Got data %s' % repr(result))
            except InfiniteLoginError:
                log.exception()
                yield self.callController("fail", task=task)
                needbreak = True
            except Exception:
                # Exception (not a bare except) so GeneratorExit and
                # KeyboardInterrupt are not swallowed at the yield above.
                log.exception()
            # Sent without waiting for the controller's reply.
            self.callController('sendResult', reqid, task, json.dumps(result))
            if needbreak:
                break

    @inlineCallbacks
    def getContent(self, agent, task):
        '''
        Download the page named by a pickled ``task``.

        Returns ``(response, prefix)`` where ``prefix`` is the ``prefix``
        entry of ``task.tbody``.
        '''
        # NOTE(security): unpickling can execute arbitrary code; this is only
        # safe as long as tasks come from a trusted controller.
        task = cPickle.loads(task)
        tbody = task.tbody

        url = tbody.get('task')
        log.debug('Getting data with url: %s' % url)

        result = yield request(agent, url)
        returnValue((result, tbody.get('prefix')))

    @inlineCallbacks
    def search(self, agent, task):
        '''
        Fetch one detail page and merge the three info sections into a dict.

        The first, second and third ``content_detail`` blocks feed
        ``parse_basic_info``, ``parse_intro_info`` and ``parse_extra_info``;
        a missing block contributes nothing.  Links of the extra section are
        made absolute against the task prefix.  Any other error is logged
        and an empty dict is returned.
        '''
        result = {}
        try:
            data, url = yield self.getContent(agent, task)
            el = lxml.etree.HTML(data)
            mc = el.xpath("//div[@class='r_sub_box']/div[@class='content_detail']")
            try:
                basic_info = NodeService.parse_basic_info(mc[0])
            except IndexError:
                basic_info = {}
            try:
                intro_info = NodeService.parse_intro_info(mc[1])
            except IndexError:
                intro_info = {}
            try:
                extra_info = NodeService.parse_extra_info(mc[2])
                for u in extra_info['extra']:
                    u['link'] = urllib2.urlparse.urljoin(url, u['link'])
            except IndexError:
                extra_info = {}

            result.update(basic_info)
            result.update(intro_info)
            result.update(extra_info)
        except Exception as msg:
            log.debug("Got Something Wrong with Task: %s Error: %s" % (repr(task), repr(msg)))

        returnValue(result)

    @staticmethod
    def parse_basic_info(el):
        '''
        Extract picture, title and the rows of the basic-info table.

        Table rows are matched positionally to ``avg``, ``tel``, ``address``,
        ``date``, ``payload`` and ``best_seller``; rows beyond those keys
        and rows lacking the expected cell are ignored.
        '''
        result = {}

        # The picture and the title are looked up independently so that a
        # page without a picture still yields its title.
        pics = el.xpath('//div[@id="item"]/a/img/@src')
        if pics:
            result['pic'] = pics[0]
        titles = el.xpath("./div[@class='r_content']/div[@class='title']/text()")
        if titles:
            result['title'] = titles[0].encode('latin1')

        trs = el.xpath("./div[@class='r_content']/div[@class='list']/table/tr")

        keys = ['avg', 'tel', 'address', 'date', 'payload', 'best_seller']
        second_cell_keys = ('tel', 'address', 'payload', 'best_seller')

        # zip() stops at the shorter sequence, so surplus rows are dropped.
        for key, tr in zip(keys, trs):
            try:
                if key in second_cell_keys:
                    result[key] = tr.xpath('./td')[1].xpath('./text()')[0].encode('latin1')
                elif key == 'date':
                    # kept as the raw list of text nodes
                    result[key] = tr.xpath('./td')[1].xpath('./text()')
                else:
                    result[key] = tr.xpath('./td/span/text()')[0].encode('latin1')
            except IndexError:
                pass

        return result

    @staticmethod
    def parse_intro_info(el):
        '''
        Extract the rows of the introduction table.

        Rows are matched positionally to ``company_intro``, ``preferential``,
        ``card_detail``, ``parking`` and ``buss``; rows beyond those keys
        and rows lacking the expected cell are ignored.
        '''
        result = {}
        trs = el.xpath('./div[@class="list no_margintop"]/table/tr')

        keys = ['company_intro', 'preferential', 'card_detail', 'parking', 'buss']

        for key, tr in zip(keys, trs):
            try:
                if key == 'buss':
                    result[key] = tr.xpath('./td/div[@class="left"]/text()')[0].encode('latin1')
                else:
                    result[key] = tr.xpath('./td')[1].xpath('./text()')[0].encode('latin1')
            except IndexError:
                pass

        return result

    @staticmethod
    def parse_extra_info(el):
        '''
        Return ``{'extra': [...]}`` with one ``link``/``name`` dict per anchor.

        Anchors without an ``href`` or without text are skipped.
        '''
        hs = []
        for anchor in el.xpath('./div[@class="other"]/a'):
            try:
                hs.append({
                    'link': anchor.xpath('./@href')[0].encode('latin1'),
                    'name': anchor.xpath('./text()')[0].encode('latin1'),
                })
            except IndexError:
                pass

        return {'extra': hs}
示例#8
0
class NodeService(ClientServiceBase):
    '''
    Client node of the ``observer.taobao.active_spider`` service.

    Agents pull task ids from the controller, download ``<tid><SUFFIX>`` and
    report the ``ids`` entry of the JSON response.
    '''

    servicename = 'observer.taobao.active_spider'

    def __init__(self, *args, **kwargs):
        '''
        Initialise the base service and copy the node settings from ``cfg``.

        ``cfg`` is read from ``kwargs['cfg']``; all arguments are forwarded to
        ``ClientServiceBase.__init__``.
        '''
        ClientServiceBase.__init__(self, *args, **kwargs)
        cfg = kwargs['cfg']
        self.name = socket.gethostname() + cfg.prefix  # node name

        self.proxy = cfg.http_proxy  # handed to getAgent() in addAgent()
        self.user_agent = cfg.http_agent
        self.max_agent = cfg.max_agent
        self.agent_pool = TimedAgentPool()
        self.last_clear = 0
        self.ready = True

    def addAgent(self, seq):
        ''' Create a new agent, register it in agent_pool and start its loop. '''
        agent = getAgent(self.proxy, self.user_agent)
        agent.remove = False
        agent.seq = seq
        self.agent_pool.initAgent(agent)
        # The loop runs on its own; its Deferred is intentionally not awaited.
        self.searchLoop(agent)

    @inlineCallbacks
    def startService(self):
        ''' Set the time zone to PRC, start the base service, then fill the pool. '''
        os.environ['TZ'] = 'PRC'
        time.tzset()
        yield ClientServiceBase.startService(self)
        self.fillAgents()

    @inlineCallbacks
    def fillAgents(self):
        ''' Every 10 seconds, top the pool up to ``max_agent`` agents. '''
        while 1:
            seq = 0
            while len(self.agent_pool.agents) < self.max_agent:
                seq += 1
                self.addAgent(seq)
            yield wait(10.)

    @inlineCallbacks
    def searchLoop(self, agent):
        '''
        Work loop of a single agent.

        Pulls task ids from the controller, runs ``search`` on each and sends
        the result (``None`` on failure) back.  The loop ends when the agent
        is flagged for removal or after an ``InfiniteLoginError``.
        '''
        needbreak = False
        while 1:
            if agent.remove:
                self.agent_pool.removeAgent(agent)
                break
            reqid, tid = yield self.callController('nextRequest', 'extract')
            log.info('Got tid %s from server' % tid)

            try:
                result = yield self.search(agent, tid)
                log.debug('Got data %s' % repr(result))
            except InfiniteLoginError:
                log.exception()
                yield self.callController("fail", tid=tid)
                result = None
                needbreak = True
            except Exception:
                # Exception (not a bare except) so GeneratorExit and
                # KeyboardInterrupt are not swallowed at the yield above.
                log.exception()
                result = None
            # Sent without waiting for the controller's reply.
            self.callController('sendResult', reqid, tid, result)
            if needbreak:
                break

    @inlineCallbacks
    def getContent(self, agent, uid, token=None):
        '''
        Download the page at ``uid`` + ``SUFFIX``.

        ``token`` is accepted for compatibility with existing callers but is
        not used to build the request.
        '''
        url = "%s%s" % (uid, SUFFIX)
        log.debug('Getting data with url: %s' % url)
        result = yield request(agent, url)
        returnValue(result)

    @inlineCallbacks
    def search(self, agent, tid, token=None):
        '''
        Fetch the data for ``tid`` and return its ``ids`` entry as JSON text.

        ``token`` is optional: ``searchLoop`` calls this method without one,
        which used to raise ``TypeError`` on every request.  Download and
        decode errors are logged and reported as the JSON text ``null``.
        '''
        result = None
        try:
            data = yield self.getContent(agent, tid, token)
            result = json.loads(data).get('ids')
        except Exception as msg:
            log.debug("Got Something Wrong with tid: %s Error: %s" %
                      (tid, repr(msg)))

        returnValue(json.dumps(result))
示例#9
0
class NodeService(ClientServiceBase):
    '''
    Client node of the ``observer.creditor.active_spider`` service.

    Agents pull pickled tasks from the controller, download the listing page
    described by each task and report the value found by ``parse_pages``
    together with the shop links found on the page.
    '''

    servicename = 'observer.creditor.active_spider'

    def __init__(self, *args, **kwargs):
        '''
        Initialise the base service and copy the node settings from ``cfg``.

        ``cfg`` is read from ``kwargs['cfg']``; all arguments are forwarded to
        ``ClientServiceBase.__init__``.
        '''
        ClientServiceBase.__init__(self, *args, **kwargs)
        cfg = kwargs['cfg']
        self.name = socket.gethostname() + cfg.prefix    # node name

        self.proxy = cfg.http_proxy     # handed to getAgent() in addAgent()
        self.user_agent = cfg.http_agent
        self.max_agent = cfg.max_agent
        self.agent_pool = TimedAgentPool()
        self.last_clear = 0
        self.ready = True

    def addAgent(self, seq):
        ''' Create a new agent, register it in agent_pool and start its loop. '''
        agent = getAgent(self.proxy, self.user_agent)
        agent.remove = False
        agent.seq = seq
        self.agent_pool.initAgent(agent)
        # The loop runs on its own; its Deferred is intentionally not awaited.
        self.searchLoop(agent)

    @inlineCallbacks
    def startService(self):
        ''' Set the time zone to PRC, start the base service, then fill the pool. '''
        os.environ['TZ'] = 'PRC'
        time.tzset()
        yield ClientServiceBase.startService(self)
        self.fillAgents()

    @inlineCallbacks
    def fillAgents(self):
        ''' Every 10 seconds, top the pool up to ``max_agent`` agents. '''
        while 1:
            seq = 0
            while len(self.agent_pool.agents) < self.max_agent:
                seq += 1
                self.addAgent(seq)
            yield wait(10.)

    @inlineCallbacks
    def searchLoop(self, agent):
        '''
        Work loop of a single agent.

        Pulls tasks from the controller, runs ``search`` on each and sends the
        JSON-encoded result (``null`` on failure) back.  The loop ends when
        the agent is flagged for removal or after an ``InfiniteLoginError``.
        '''
        needbreak = False
        while 1:
            result = None
            if agent.remove:
                self.agent_pool.removeAgent(agent)
                break

            reqid, task = yield self.callController('nextRequest', 'extract')
            log.info(repr(task))

            try:
                result = yield self.search(agent, task)
                log.debug('Got data %s' % repr(result))
            except InfiniteLoginError:
                log.exception()
                yield self.callController("fail", task=task)
                needbreak = True
            except Exception:
                # Exception (not a bare except) so GeneratorExit and
                # KeyboardInterrupt are not swallowed at the yield above.
                log.exception()
            # Sent without waiting for the controller's reply.
            self.callController('sendResult', reqid, task, json.dumps(result))
            if needbreak:
                break

    @inlineCallbacks
    def getContent(self, agent, task):
        '''
        Download the page described by ``task``.

        The URL is ``urljoin(prefix, suffix)`` from ``task.tbody`` with the
        task's ``ccode`` and ``page`` substituted into it.
        '''
        tbody = task.tbody

        req_url = urllib2.urlparse.urljoin(tbody.get('prefix'),
                                           tbody.get('suffix'))
        url = req_url % (tbody.get('ccode'), tbody.get('page'))
        log.debug('Getting data with url: %s' % url)
        result = yield request(agent, url)
        returnValue(result)

    @staticmethod
    def parse_pages(el):
        '''
        Return the last all-digit link text of the page header, or ``-2``.

        ``-2`` is returned when the header or a numeric link is missing.
        NOTE(review): presumably the last link is the total page count --
        confirm against the page layout.
        '''
        try:
            mmc = el.xpath("./div[@class='page_header clearfix']")[0]
            numeric = [text for text in mmc.xpath('./div[@class="l_content"]/a/text()')
                       if text.isdigit()]
            result = numeric[-1]
        except IndexError:
            result = -2

        return result

    @staticmethod
    def parse_shop_url(el):
        ''' Return the first ``./td/a/@href`` below ``el``, or ``""`` if none. '''
        try:
            href = el.xpath("./td/a/@href")[0]
        except IndexError:
            href = ""

        return href

    @staticmethod
    def parse_items(el):
        '''
        Return the non-empty shop links of every ``page_item`` block.

        Items without a ``shopinfo`` row or without a link are skipped, so a
        single malformed item no longer discards the whole page.
        '''
        hrefs = []
        for item in el.xpath("//div[@class='page_item']"):
            rows = item.xpath("./table/tbody/tr/td[2]/table[@class='shopinfo']/tr")
            if not rows:
                # Indexing rows[0] here used to raise IndexError and abort
                # the extraction for every item on the page.
                continue
            href = NodeService.parse_shop_url(rows[0])
            if href:
                hrefs.append(href)

        return hrefs

    @inlineCallbacks
    def search(self, agent, task):
        '''
        Fetch one listing page and extract its shop links.

        ``task`` is a pickled task object.  Returns ``(pages, hrefs_json)``
        where ``hrefs_json`` is the JSON-encoded list of absolute links.  On
        a download or parse error the failure is logged and the defaults
        reached so far are returned (``-1`` and ``null`` if nothing was
        parsed).
        '''
        # NOTE(security): unpickling can execute arbitrary code; this is only
        # safe as long as tasks come from a trusted controller.
        task = cPickle.loads(task)
        pages, hrefs = -1, None
        try:
            data = yield self.getContent(agent, task)
            el = lxml.etree.HTML(data)
            mc = el.xpath("//div[@class='r_sub_box']/div[@class='middle_content']/div[@class='page_content clearfix']")[0]
            pages = NodeService.parse_pages(mc)
            prefix = task.tbody.get('prefix')
            hrefs = [urllib2.urlparse.urljoin(prefix, href)
                     for href in NodeService.parse_items(mc)]
        except Exception as msg:
            log.debug("Got Something Wrong with url: %s Error: %s" % (repr(task), repr(msg)))

        returnValue((pages, json.dumps(hrefs)))
class NodeService(ClientServiceBase):
    ''' sweibo fetcher node '''

    servicename = 'observer.sina.weibo.backtracking_spider'

    def __init__(self, *args, **kwargs):
        '''
        Initialise the base service and derive the timing settings from ``cfg``.

        ``cfg`` is read from ``kwargs['cfg']`` and must expose ``prefix``,
        ``http_proxy``, ``http_agent``, ``http_interval_min``,
        ``http_interval_max``, ``login_interval`` and ``max_agent``.
        The node starts with ``ready`` set to ``False``.
        '''
        ClientServiceBase.__init__(self, *args, **kwargs)
        config = kwargs['cfg']

        # node name = host name + configured prefix
        self.name = socket.gethostname() + config.prefix

        self.proxy = config.http_proxy  # handed to getAgent() in addAgent()
        self.userAgent = config.http_agent

        # Request pacing: bounds of the wait interval and their mean.
        shortest = config.http_interval_min
        self.interval_min = shortest
        longest = config.http_interval_max
        self.interval_max = longest
        self.avg_interval = (shortest + longest) / 2.0
        self.login_interval = config.login_interval
        self.max_agent = config.max_agent

        self.agentPool = TimedAgentPool(shortest, longest, self.login_interval)

        self.last_clear = 0
        # 3600.0 * 2 divided by the mean interval, truncated to an int.
        # NOTE(review): presumably "requests per two hours" -- confirm.
        self.count = int((2 * 3600.0) / self.avg_interval)
        self.ready = False

    def addAgent(self, username, password, seq):
        '''
        Add a new agent to the pool and start its search loop.

        An agent is built from one Sina Weibo account (username and password)
        with its own, initially empty cookie jar.  ``seq`` is stored on the
        agent unchanged.
        '''
        jar = CookieJar()
        agent = getAgent(username, password, self.proxy, self.userAgent, jar)

        # Bookkeeping attributes read by searchLoop().
        agent.remove = False
        agent.seq = seq

        self.agentPool.initAgent(agent)
        # The loop runs on its own; its Deferred is intentionally not awaited.
        self.searchLoop(agent)

    @staticmethod
    def getRegion(province, city):
        ''' Build the ``custom:<province>:<city>`` region value from two numeric codes. '''
        codes = (province, city)
        return 'custom:%d:%d' % codes

    @staticmethod
    def timeScopeStr(t):
        ''' Format ``t`` as ``YYYY-MM-DD-H``; the hour is not zero-padded. '''
        date_part = t.strftime('%Y-%m-%d-')
        hour_part = str(t.hour)
        return date_part + hour_part

    @staticmethod
    def getTimeScope(begintime, endtime):
        '''
        Build the ``custom:<begin>:<end>`` time-scope value.

        Both bounds are formatted with ``NodeService.timeScopeStr``.
        '''
        begin_text = NodeService.timeScopeStr(begintime)
        end_text = NodeService.timeScopeStr(endtime)
        return 'custom:%s:%s' % (begin_text, end_text)

    @inlineCallbacks
    def startService(self):
        '''
        Start the fetch service.

        Sets the ``TZ`` environment variable to ``PRC`` and applies it with
        ``time.tzset()``, waits for ``ClientServiceBase.startService`` and
        then kicks off ``fillAgents`` without waiting for it.
        '''
        os.environ['TZ'] = 'PRC'
        time.tzset()

        base_started = ClientServiceBase.startService(self)
        yield base_started

        self.fillAgents()

    @inlineCallbacks
    def fillAgents(self):
        '''
        Keep the agent pool filled up to ``max_agent`` agents.

        While the pool is short, one account is popped from the module-level
        ``users`` heap per ``login_interval`` and turned into an agent
        (entries whose username is ``None`` are skipped).  The pool is
        re-checked every 60 seconds.  When the heap is empty the refill pass
        is abandoned until the next check instead of letting ``IndexError``
        terminate this loop for good.
        '''
        while 1:
            while len(self.agentPool.agents) < self.max_agent:
                try:
                    username, password, seq = heappop(users)
                except IndexError:
                    # No account left right now; retry on the next pass.
                    log.debug("no account left in the users heap, retrying later")
                    break
                if username is not None:
                    self.addAgent(username, password, seq)
                yield wait(self.login_interval)
            yield wait(60.0)

    @inlineCallbacks
    def searchLoop(self, agent=None):
        '''
        Work loop of a single agent.

        Repeatedly asks the controller for the next request, runs ``search``
        for it and sends the result (``None`` on failure) back, then waits a
        random time between ``interval_min`` and ``interval_max``.  The end of
        the requested time scope is moved back by ``ONE_HOUR`` before it is
        formatted.  The loop ends when the agent is flagged for removal or
        after an ``InfiniteLoginError``, which is also reported to the
        controller.

        NOTE(review): ``agent`` defaults to ``None`` but is dereferenced
        immediately, so callers must always pass an agent.
        '''
        #FIXME REFACTOR THIS METHOD
        needbreak = False
        while True:
            if agent.remove:
                self.agentPool.removeAgent(agent)
                break

            reqid, keyword, page, timescope, region, skid = yield self.callController('nextRequest')

            begintime = timescope.get('start_date')
            endtime = timescope.get('end_date')
            timescope = self.getTimeScope(begintime, endtime-ONE_HOUR)

            try:
                result = yield self.search(
                    agent=agent,
                    keyword=keyword,
                    page=page,
                    timescope=timescope,
                    region=region,
                )
            except InfiniteLoginError:
                log.exception()
                yield self.callController("fail", agent.username, kid=skid)
                result = None
                needbreak = True
            except Exception:
                # Exception (not a bare except) so GeneratorExit and
                # KeyboardInterrupt are not swallowed at the yield above.
                log.exception()
                result = None

            log.info("Sending Data to server with kid %s" % skid)
            # Sent without waiting for the controller's reply.
            self.callController('sendResult', reqid, skid, result)
            log.info("Sent Data to server with kid %s" % skid)
            yield wait(random.uniform(self.interval_min, self.interval_max))
            if needbreak:
                break

    @inlineCallbacks
    def getContent(self, agent, keyword, page, timescope, region):
        '''
        Request one search result page and return the response.

        The keyword is passed through ``quote_plus`` twice before it is put
        into the URL.  ``region`` and ``timescope`` are appended as query
        parameters only when they are not ``None``; ``nodup=1`` is always
        appended last.
        '''
        encoded_keyword = quote_plus(quote_plus(keyword))
        pieces = [
            'http://s.weibo.com/weibo/%s&Refer=index&page=%d' % (
                encoded_keyword,
                page,
            ),
        ]
        if region is not None:
            pieces.append('&region=' + region)
        if timescope is not None:
            pieces.append('&timescope=' + timescope)
        pieces.append('&nodup=1')

        response = yield request(agent, ''.join(pieces))
        returnValue(response)


    @staticmethod
    def getFeedsHtml(content):
        """
        Return the feed-list HTML embedded in a search result page, or None.

        Sina Weibo search result pages fill in their results with JavaScript:
        the content is passed as arguments to STK.pageletM.view calls.  Each
        match of WEIBO_FEED_PATTERN is decoded as JSON; the ``html`` entry of
        the first one whose ``pid`` is ``pl_weibo_direct`` or
        ``pl_weibo_feedlist`` is returned.
        """
        wanted_pids = ('pl_weibo_direct', 'pl_weibo_feedlist')
        for match in WEIBO_FEED_PATTERN.finditer(content):
            payload = json.loads(match.group('js'))
            if payload['pid'] not in wanted_pids:
                continue
            return payload['html']
        return None

    @staticmethod
    def getText(el):
        """
        Collect the text of a post body element, recursively.

        Returns ``(text, has_link)``.  Literal ``&nbsp;`` sequences in element
        text and tail are replaced by a space, ``img`` elements of type
        ``face`` contribute their ``alt`` attribute, and ``has_link`` is True
        when ``el`` or any descendant is an ``a`` element whose ``mt``
        attribute is ``url``.
        """
        pieces = []
        has_link = False

        if el.tag == 'img' and el.get('type', None) == 'face':
            pieces.append(el.get('alt', ''))
        elif el.tag == 'a' and el.get('mt', '') == 'url':
            has_link = True

        leading = el.text
        if leading is not None:
            pieces.append(leading.replace(u'&nbsp;', u' '))

        for child in el.iterchildren():
            child_text, child_has_link = NodeService.getText(child)
            pieces.append(child_text)
            has_link = has_link or child_has_link

        trailing = el.tail
        if trailing is not None:
            pieces.append(trailing.replace(u'&nbsp;', u' '))

        return ''.join(pieces), has_link

    @staticmethod
    def parseTextContent(el):
        """
        Extract the information carried by the text part of a post.

        Returns a dict with:
        name: user name (``nick-name`` attribute of the first link)
        isV: whether an ``approve`` / ``approve_co`` badge image is present
        text: text of the first ``em`` child, as produced by getText
        haslink: whether that text contains a link

        Raises IndexError when the name link or the ``em`` element is missing.
        """
        nick_name = el.xpath('./a/@nick-name')[0]
        badges = el.xpath("./a/img[@class='approve' or @class='approve_co']")
        body_el = el.xpath("./em")[0]
        body_text, has_link = NodeService.getText(body_el)
        return {
            'name': unicode(nick_name),
            'isV': bool(badges),
            'text': body_text,
            'haslink': has_link,
        }

    @staticmethod
    def parsePic(el):
        """
        Return the ``src`` of the first picture in the post's ``piclist``,
        or None when the post has no picture.
        """
        sources = el.xpath("./ul[@class='piclist']/li/img[@action-type='feed_list_media_img']/@src")
        if sources:
            return unicode(sources[0])
        return None

    @staticmethod
    def parseForwardedPic(el):
        """
        Return the ``src`` of the first picture in the ``piclist`` under a
        ``dd`` child of ``el``, or None when there is no picture.
        """
        sources = el.xpath("./dd/ul[@class='piclist']/li/img[@action-type='feed_list_media_img']/@src")
        return unicode(sources[0]) if sources else None

    @staticmethod
    def getCount(text):
        """
        Return the number captured by WEIBO_COUNT_PATTERN in ``text``.

        Returns 0 when ``text`` is None or empty, or when the pattern does
        not match.  parseMiscInfo passes an element's ``.text``, which is
        None for an element without leading text; handling it here avoids a
        TypeError from the regular expression.
        """
        if not text:
            return 0
        m = WEIBO_COUNT_PATTERN.search(text)
        if m:
            return int(m.group('count'))
        return 0

    @staticmethod
    def parseMiscInfo(el):
        """
        Extract comment count, forward count, publish time and source of a
        post, and take the mid from the post's address.

        Returns ``(d, mid)`` where ``d`` has the keys ``ccount``, ``rcount``,
        ``cdate``, ``uid`` and ``source``.  ``uid`` is the second-to-last
        path segment of the date link's ``href``; ``mid`` is the last segment
        passed through ``url_decode``.  Raises IndexError when the info block
        or the date link is missing.
        """
        info = el.xpath("./*[@class='info W_linkb W_textb']")[0]
        d = {'ccount': 0, 'rcount': 0, 'source': ''}

        for link in info.xpath("./span/a"):
            action = link.get('action-type', '')
            if action == 'feed_list_forward':
                d['rcount'] = NodeService.getCount(link.text)
            elif action == 'feed_list_comment':
                d['ccount'] = NodeService.getCount(link.text)

        date_attr = info.xpath("./a[@node-type='feed_list_item_date']/@date")[0]
        d['cdate'] = int(date_attr)

        href = info.xpath("./a[@node-type='feed_list_item_date']/@href")[0]
        segments = unicode(href).split('/')
        mid = url_decode(segments[-1])
        d['uid'] = int(segments[-2])

        d['source'] = unicode(info.xpath("./a[last()]/text()")[0])
        return d, mid

    @staticmethod
    def parseContent(el):
        """
        Extract the content information of a post.

        Combines the result of parseTextContent with ``pic`` (parsePic),
        ``hot`` (whether a ``hot_feed`` div is present) and the dict returned
        by parseMiscInfo.  The mid returned by parseMiscInfo is not used here.
        """
        content_el = el.xpath("./p[@node-type='feed_list_content']")[0]
        d = NodeService.parseTextContent(content_el)
        d['pic'] = NodeService.parsePic(el)

        misc, _unused_mid = NodeService.parseMiscInfo(el)
        hot_markers = el.xpath("./div[@class='hot_feed']")
        d['hot'] = bool(hot_markers)
        d.update(misc)
        return d

    @staticmethod
    def parseForwardContent(el):
        """
        Extract the information of a forwarded (original) post.

        Returns the parseTextContent dict extended with ``pic``, the
        parseMiscInfo fields and ``mid``, or None when any step fails.
        """
        try:
            log.debug("parsing forwarded content...")
            header = el.xpath("./dt[@node-type='feed_list_forwardContent']")[0]
            d = NodeService.parseTextContent(header)
            d['pic'] = NodeService.parseForwardedPic(el)
            misc, mid = NodeService.parseMiscInfo(el)
            d.update(misc)
            d['mid'] = mid
        except:     # filters out posts that have been deleted
            log.exception()
            return None
        return d

    @staticmethod
    def parseFace(el):
        """
        Return the avatar URL (``./a/img/@src``) below ``el``, or None when
        there is no such image.
        """
        sources = el.xpath("./a/img/@src")
        if not sources:
            return None
        return unicode(sources[0])

    @staticmethod
    def parseFeed(el):
        """
        Build the record of one post, or return None.

        None is returned when ``el`` has no ``mid`` attribute or when any
        parsing step raises (the element and the exception are logged).

        Keys of the returned dict:
        _id / id: post id
        ccount: number of comments
        cdate: publish time
        fcount: follower count (not available, always None)
        flash: exposure count, for now equal to the follower count
               (not available, always None)
        gender: gender (not available, always None)
        haslink: whether the post text contains a link
        hot: whether this is a hot post
        isV: whether the author is verified
        loc: author location (not available, always None)
        name: author nickname
        pan: sentiment (not available, always None)
        pan_ot: corrected sentiment (not available, always None)
        pic: picture in the content
        rcount: number of forwards
        retweet: information about the original post, with the keys
            rcount, isV, uid, ccount, source, date, text, pic, id, s_name;
            every value is None unless a forwarded post was parsed
        segment: word segmentation (not available, always None)
        source: source
        text: content
        uid: user id
        uimg: user avatar
        """
        unavailable = (
            'fcount', 'flash', 'gender', 'loc', 'pan', 'pan_ot', 'segment',
        )
        d = dict.fromkeys(unavailable)

        # retweet key -> key of the dict returned by parseForwardContent
        retweet_sources = (
            ('rcount', 'rcount'),
            ('isV', 'isV'),
            ('uid', 'uid'),
            ('ccount', 'ccount'),
            ('text', 'text'),
            ('source', 'source'),
            ('id', 'mid'),
            ('pic', 'pic'),
            ('s_name', 'name'),
            ('date', 'cdate'),
        )
        try:
            raw_mid = el.get('mid', None)
            if raw_mid is None:
                return None
            mid = int(raw_mid)
            d['_id'] = mid
            d['id'] = mid

            face_el = el.xpath("./dt[@class='face']")[0]
            d['uimg'] = NodeService.parseFace(face_el)

            content_el = el.xpath("./dd[@class='content']")[0]
            d.update(NodeService.parseContent(content_el))

            forwarded = content_el.xpath(
                "./dl/dt[@node-type='feed_list_forwardContent']")
            retweet = dict.fromkeys((
                'rcount', 'isV', 'uid', 'ccount', 'text',
                'source', 'date', 'pic', 'id', 's_name',
            ))
            d['retweet'] = retweet
            if forwarded:
                fd = NodeService.parseForwardContent(forwarded[0].getparent())
                if fd is not None:
                    for target, origin in retweet_sources:
                        retweet[target] = fd[origin]
            return d
        except:
            log.error('error when parsing: ' + lxml.etree.tostring(el))
            log.exception()
            return None

    @staticmethod
    def parseUser(el):
        '''
            Parse one user entry of a people-search result page.

            NOTE: not implemented yet. The function has no body besides this
            docstring, so it always returns None; getUsers() filters out None
            results and therefore currently yields an empty user list.

            Planned fields and the XPath (relative to ``el``) noted for each.
            The original notes carry a bare "1" after every entry except the
            last; its meaning is not recorded -- TODO confirm.

            _id (user id):
                ./div[@class='person_pic']/a/img/@uid

            avatar:
                ./div[@class='person_pic']/a/img/@src

            nickname:
                ./div[@class='person_detail']/p/[@class='person_name']/a/text()

            gender:
                ./div[@class='person_detail']/p/[@class='person_addr']/span[1]/@title

            location:
                ./div[@class='person_detail']/p/[@class='person_addr']/span[2]/text()

            NOTE(review): ``p/[@class=...]`` is copied verbatim from the notes;
            it is presumably meant to be ``p[@class=...]`` -- confirm before
            using these expressions.
        '''

    @staticmethod
    def getUsers(html_content):
        '''
            Extract the users and the page count from a people-search page.

            Every ``//div[@class='pl_personlist']/div`` element is handed to
            parseUser(); entries for which it returns None are dropped.

            :param html_content: HTML markup of the result page.
            :return: ``(users, total_pages)``. ``([], 0)`` is returned when
                the page contains a ``pl_noresult`` block. ``total_pages`` is
                0 when no pager link is found or its text is not a number.
        '''
        html = lxml.etree.HTML(html_content)
        if html.xpath("//div[@class='pl_noresult']"):
            return [], 0

        users = []
        for node in html.xpath("//div[@class='pl_personlist']/div"):
            user = NodeService.parseUser(node)
            if user is not None:
                users.append(user)

        # Text of the link in the second-to-last pager item.
        pager = html.xpath("//ul[@class='search_page_M']/li[last()-1]/a/text()")
        total_pages = 0
        if pager:
            try:
                total_pages = int(pager[0])
            except (TypeError, ValueError):
                # Only a non-numeric pager text is expected here; anything
                # else should not be swallowed.
                log.info("No page found: " + repr(pager))
        return users, total_pages
        

    @staticmethod
    def getFeeds(html_content):
        """
        Extract the posts and the page count from a search result page.

        The posts are stored under ``<dl action-type="feed_list_item">``
        elements. Each one is parsed with parseFeed() and entries that come
        back as ``None`` are dropped.

        :param html_content: HTML markup of the result page.
        :return: ``(feeds, total_pages)``. ``([], 0)`` is returned when the
            page contains a ``pl_noresult`` block. ``total_pages`` is 0 when
            no pager link is found or its text is not a number.
        """
        html = lxml.etree.HTML(html_content)
        if html.xpath("//div[@class='pl_noresult']"):
            return [], 0

        feeds = []
        for node in html.xpath("//dl[@action-type='feed_list_item']"):
            feed = NodeService.parseFeed(node)
            if feed is not None:
                feeds.append(feed)

        # Look up the number of the last page. When we are already on the
        # last page this yields the number of the previous page; when there
        # is only one page the element is not present at all.
        pager = html.xpath("//ul[@class='search_page_M']/li[last()-1]/a/text()")
        total_pages = 0
        if pager:
            try:
                total_pages = int(pager[0])
            except (TypeError, ValueError):
                # Only a non-numeric pager text is expected here; anything
                # else should not be swallowed.
                log.info('no page found: '+repr(pager))
        return feeds, total_pages

    @inlineCallbacks
    def _searchHour(self, agent, keyword, statuses, t):
        '''
            Search ``keyword`` for the time scope built from ``(t, t)``.

            A search without a region is tried first. When it reports a
            result its feeds are returned as they are; otherwise every
            province in ``cities`` is searched through _searchProvince()
            and the feeds are accumulated.

            :return: (through the Deferred) the collected feeds.
        '''
        scope = self.getTimeScope(t, t)
        seen = set()    # shared by all province searches of this call
        found, feeds = yield self._search(
            agent, keyword, statuses, None, scope)

        if not found:
            # NOTE(review): ``feeds`` from the failed search is extended
            # as-is; if _search can return None here, ``+=`` raises
            # TypeError -- confirm against _search.
            for province in cities:
                batch = yield self._searchProvince(
                    agent, keyword, statuses, seen, scope, province)
                feeds += batch
        returnValue(feeds)

    @inlineCallbacks
    def _searchProvince(
        self,
        agent,
        keyword,
        statuses,
        astatuses,
        timescope,
        province,
    ):
        '''
            Search one province, falling back to a city-by-city search.

            The province-wide region (``getRegion(province, 1000)``) is
            searched first. When ``cities[province]`` has at most one entry
            the extra trailing ``True`` argument is passed to _search() right
            away (its meaning is defined by _search, which is not visible
            here). If that search reports a result, its feeds are returned.
            Otherwise every entry of ``cities[province]`` after the first is
            searched on its own and the feeds are concatenated.

            A failure while searching a city stops the city loop; the feeds
            collected so far are still returned. The failure is logged
            instead of being dropped silently.

            :return: (through the Deferred) list of feeds.
        '''
        region = self.getRegion(province, 1000)
        province_args = [
            agent, keyword, statuses, astatuses, timescope, region]
        if len(cities[province]) <= 1:
            province_args.append(True)
        haveResult, feeds = yield self._search(*province_args)

        if haveResult:
            returnValue(feeds)

        feeds = []
        for city, _ in cities[province][1:]:
            city_region = self.getRegion(province, city)
            try:
                _, fs = yield self._search(
                    agent,
                    keyword,
                    statuses,
                    astatuses,
                    timescope,
                    city_region,
                    True,
                )
                if fs:
                    feeds += fs
            except Exception:
                # Best effort: keep what was collected, but leave a trace of
                # why the remaining cities were skipped.
                log.exception()
                break

        returnValue(feeds)

    @inlineCallbacks
    def search(self, agent, keyword, page, timescope=None, region=None, skid=None):
        '''
            Fetch one result page for ``keyword`` and parse its feeds.

            The keyword is UTF-8 encoded before it is handed to getContent().
            Every parsed feed gets a ``kid`` entry set to ``skid``.

            :return: (through the Deferred) ``(total_pages, feeds)``, or
                ``(None, None)`` when getFeedsHtml() finds no feed HTML in
                the page.
        '''
        encoded = keyword.encode('utf8')
        page_content = yield self.getContent(
            agent, encoded, page, timescope, region)
        html_content = self.getFeedsHtml(page_content)

        if html_content is None:
            log.debug("Got feed content: " + str(page_content))
            returnValue((None, None))

        feeds, total_pages = self.getFeeds(html_content)
        for feed in feeds:
            feed.update({'kid': skid})
        log.info("GotFeeds: %d, page: %d/%d" % (len(feeds), page, total_pages))
        returnValue((total_pages, feeds))
示例#11
0
class NodeService(ClientServiceBase):
    ''' Spider node that fetches the status timeline of Weibo users. '''

    servicename = 'observer.sina.users.user_spider'

    def __init__(self, *args, **kwargs):
        '''
            Set the node up from ``kwargs['cfg']``.

            This also calls get_token(), which performs a blocking HTTP
            request.
        '''

        ClientServiceBase.__init__(self, *args, **kwargs)
        cfg = kwargs['cfg']
        self.name = socket.gethostname() + cfg.prefix  # node name

        self.proxy = cfg.http_proxy  # handed to getAgent() in addAgent()
        self.userAgent = cfg.http_agent
        self.max_agent = cfg.max_agent
        self.agentPool = TimedAgentPool()
        self.token = self.get_token()
        self.last_clear = 0
        self.ready = True

    def get_token(self):
        '''
            Fetch a usable access token.

            :return: the ``access_token`` entry of the JSON document served
                by the token URL, or None when the entry is missing.
        '''
        url = 'http://insight.bestminr.com/get_token'
        response = urllib.urlopen(url)
        try:
            payload = response.read()
        finally:
            # Always release the connection, even when reading fails.
            response.close()
        return json.loads(payload).get('access_token')

    def addAgent(self, seq):
        ''' Add a new agent to agentPool and start its search loop. '''
        agent = getAgent(self.proxy, self.userAgent)
        agent.remove = False
        agent.seq = seq
        self.agentPool.initAgent(agent)
        self.searchLoop(agent)

    @inlineCallbacks
    def startService(self):
        ''' Start the fetch service (process time zone is set to PRC). '''
        os.environ['TZ'] = 'PRC'
        time.tzset()
        yield ClientServiceBase.startService(self)
        self.fillAgents()

    @inlineCallbacks
    def fillAgents(self):
        ''' Once per second, top the pool up to ``max_agent`` agents. '''
        while 1:
            # NOTE(review): seq restarts at 0 on every pass, so agents added
            # in later passes reuse earlier seq values -- confirm intended.
            seq = 0
            while len(self.agentPool.agents) < self.max_agent:
                seq += 1
                self.addAgent(seq)
            yield wait(1.)

    @inlineCallbacks
    def searchLoop(self, agent):
        '''
            Work loop of one agent.

            Pulls a (request id, uid) pair from the controller, runs search()
            for it and sends the result back. The loop ends when the agent is
            flagged for removal or after an InfiniteLoginError.
        '''
        needbreak = False
        while 1:
            if agent.remove:
                self.agentPool.removeAgent(agent)
                break
            reqid, uid = yield self.callController('nextRequest', 'user')
            log.info('Got uid %s from server' % uid)

            try:
                result = yield self.search(agent, uid, self.token)
                log.debug('Got data %s' % repr(result))
            except InfiniteLoginError:
                log.exception()
                yield self.callController("fail", uid=uid)
                result = None
                needbreak = True
            except:
                # Deliberate catch-all: one failed request must not end the
                # loop of this agent.
                log.exception()
                result = None
            # The result is sent without waiting for the call to complete.
            self.callController('sendResult', reqid, uid, result)
            if needbreak:
                break

    @inlineCallbacks
    def getContent(self, agent, uid, token):
        '''
            Request the user_timeline API for ``uid``.

            :return: (through the Deferred) the response of request(), or an
                empty list when the request failed; the failure is logged.
        '''
        result = []
        try:
            url = 'https://api.weibo.com/2/statuses/user_timeline.json?uid=%s&access_token=%s' % (
                uid, token)
            log.debug('Getting data with url: %s' % url)
            result = yield request(agent, url)
        except Exception:
            # Keep the best-effort contract (empty result), but record why
            # the request failed instead of dropping the error silently.
            log.exception()

        returnValue(result)

    @inlineCallbacks
    def search(self, agent, uid, token):
        '''
            Fetch and decode the statuses of ``uid``.

            :return: (through the Deferred) the ``statuses`` entry of the
                decoded JSON (``[]`` when absent), or None when fetching or
                decoding failed.
        '''
        result = None
        try:
            data = yield self.getContent(agent, uid, token)
            result = json.loads(data).get('statuses', [])
        except Exception as msg:
            log.debug("Got Something Wrong with uid: %s, Error: %s" %
                      (uid, repr(msg)))
            returnValue(None)

        returnValue(result)
class NodeService(ClientServiceBase):
    ''' Sina Weibo fetcher node that collects users' profile information. '''

    servicename = 'observer.sina.weibo.user_active_spider'

    def __init__(self, *args, **kwargs):
        ''' Set the node up from ``kwargs['cfg']``. '''

        ClientServiceBase.__init__(self, *args, **kwargs)
        cfg = kwargs['cfg']
        self.name = socket.gethostname() + cfg.prefix    # node name

        self.proxy = cfg.http_proxy     # handed to getAgent() in addAgent()
        self.userAgent = cfg.http_agent
        self.interval_min = cfg.http_interval_min
        self.interval_max = cfg.http_interval_max
        self.avg_interval = (self.interval_min + self.interval_max) / 2.0
        self.login_interval = cfg.login_interval
        self.max_agent = cfg.max_agent

        self.agentPool = TimedAgentPool(
            self.interval_min,
            self.interval_max,
            self.login_interval,
        )

        self.last_clear = 0
        # how many average intervals fit into two hours
        self.count = int(3600.0 * 2 / self.avg_interval)
        self.ready = False

    def addAgent(self, username, password, seq):
        '''
            Add a new agent to the pool and start its search loop.
            In fact, an agent is a username & password of sina weibo.
        '''
        cookies = CookieJar()
        agent = getAgent(
            username,
            password,
            self.proxy,
            self.userAgent,
            cookies,
        )
        agent.remove = False
        agent.seq = seq
        self.agentPool.initAgent(agent)
        self.searchLoop(agent)

    @inlineCallbacks
    def startService(self):
        ''' Start the fetch service (process time zone is set to PRC). '''
        os.environ['TZ'] = 'PRC'
        time.tzset()
        yield ClientServiceBase.startService(self)
        self.fillAgents()

    @inlineCallbacks
    def getUser(self):
        ''' Get a (username, password, seq) triple by a remote call. '''
        username, password, seq = yield self.callController('getUser')
        returnValue((username, password, seq))

    @inlineCallbacks
    def checkAgents(self):
        '''
            Every 300 seconds ask the controller to refresh each agent that
            is not flagged yet, and flag those it does not validate.
        '''
        #FIXME Should think about how to realize this function
        while True:
            for agent in self.agentPool.agents:
                if not agent.remove:
                    validated = yield self.callController(
                        'refresh',
                        agent.username,
                        agent.seq,
                    )
                    if not validated:
                        agent.remove = True
            yield wait(300.0)

    @inlineCallbacks
    def fillAgents(self):
        '''
            Keep the pool filled up to ``max_agent`` agents.

            Accounts are requested one at a time with ``login_interval``
            seconds in between; the pool size is re-checked every 60 seconds.
        '''
        while True:
            while len(self.agentPool.agents) < self.max_agent:
                username, password, seq = yield self.getUser()
                if username is not None:
                    self.addAgent(username, password, seq)
                yield wait(self.login_interval)
            yield wait(60.0)

    @inlineCallbacks
    def searchLoop(self, agent=None):
        '''
            Work loop of one agent.

            Pulls a request from the controller, runs search() for its
            ``uid`` and pushes the result back, pausing a random time between
            ``interval_min`` and ``interval_max`` after every request. The
            loop ends when the agent is flagged for removal, when no request
            id is received, or after an InfiniteLoginError.
        '''
        #FIXME REFACTOR THIS METHOD
        needbreak = False
        while True:
            if agent.remove:
                self.agentPool.removeAgent(agent)
                break

            requestid, _, _, kwg = yield self.controllerPull()
            if not requestid:
                # The pause has to be yielded to take effect.
                # NOTE(review): the loop ends here rather than retrying --
                # confirm that ``break`` (not ``continue``) is intended.
                yield wait(random.uniform(self.interval_min, self.interval_max))
                break
            try:
                result = yield self.search(kwg.get('uid'), agent)
            except InfiniteLoginError as msg:
                log.exception()
                kwg.update({'reason': str(msg)})
                yield self.controllerFail(
                    'fail',
                    *(requestid, self.servicename, self.clientid),
                    **kwg
                )
                result = None
                needbreak = True
            except:
                # Deliberate catch-all: one failed request must not end the
                # loop of this agent.
                log.exception()
                result = None

            kwg.update({'data': result or []})
            # The result is pushed without waiting for the call to complete.
            self.controllerPush(
                'push',
                *(requestid, self.servicename, self.clientid),
                **kwg
            )
            # Throttle: without the yield the pause would never be awaited.
            yield wait(random.uniform(self.interval_min, self.interval_max))
            if needbreak:
                break

    @inlineCallbacks
    def getContent(self, agent, uid):
        ''' Request the profile info page of ``uid`` and return the response. '''
        url = 'http://weibo.com/p/100505%s/info?from=page_100505&mod=TAB#place' % (uid, )
        result = yield request(agent, url)
        returnValue(result)

    @staticmethod
    def getFeedsHtml(content):
        """
        Pull the profile HTML out of the page source.

        Sina fills the user page through JavaScript: the content is carried
        in the arguments of the FM.view calls. For the calls whose ``domid``
        is one of
            [
                Pl_Official_LeftInfo__16,
                Pl_Official_LeftInfo__17,
            ]
        the ``html`` entry of the argument dict holds the HTML to display.

        :return: the ``html`` value of the first matching call, or None when
            no call matches.
        """
        JS_DOMID = ['Pl_Official_LeftInfo__16', 'Pl_Official_LeftInfo__17']
        for m in WEIBO_FEED_PATTERN.finditer(content):
            d = json.loads(m.group('js'))
            if d['domid'] in JS_DOMID:
                return d['html']
        return None

    @staticmethod
    def getFeeds(html_content):
        '''
            Parse the basic profile information of a user.

            The information is stored under ``div[@class='infoblock']``
            elements. Inside block ``i`` the label of row ``j`` is read from
                //div[@class='infoblock'][i]/div[j]/div[1]//text()
            and its value from
                //div[@class='infoblock'][i]/div[j]/div[2]//text()
            Labels listed in ``template`` are mapped to the corresponding
            result key (e.g. name, location, gender, birthday, blog, domain,
            description).

            :return: dict with one entry per ``template`` value; an entry is
                None when the label was not found, a single string when one
                non-empty text was found and a list of strings otherwise.
                ``([], 0)`` is returned when the page has no infoblock.
        '''
        html = lxml.etree.HTML(html_content)
        if not html.xpath("//div[@class='infoblock']"):
            # NOTE(review): this is a tuple while the normal result is a
            # dict -- confirm what the consumer of the data expects.
            return [], 0

        template = {
            u'昵称': 'name',
            u'真实姓名': 'rname',
            u'所在地': 'location',
            u'性别': 'gender',
            u'生日': 'birthday',    # 19xx年x月x日
            u'星座': 'constellation',
            u"性取向": 'strend',
            u"感情状况": 'emotional',
            u"血型": 'blood',
            u'博客': 'blog',
            u'个性域名': 'domain',
            u'简介': 'description',
            u"邮箱": "email",
            u"QQ": 'qq',
            u"MSN": "msn",
            u"公司": 'career',
            u"大学": 'colleage',
            u"高中": 'high_school',
            u"中专技校": "tecnical_school",
            u"初中": 'middle_school',
            u"小学": 'primary_school',
            u"标签": 'tags',
            u"认证原因": 'verified_reason',
        }

        # in fact, I can do more here, but now, I only need a counter:
        # the number of legend texts decides how many blocks are scanned
        count = html.xpath("//form[@class='info_title'][1]/fieldset/legend//text()")

        key_xpath_template = "//div[@class='infoblock'][%s]/div[%s]/div[1]//text()"
        val_xpath_template = "//div[@class='infoblock'][%s]/div[%s]/div[2]//text()"

        d = dict.fromkeys(template.values(), None)

        for i in range(1, len(count) + 1):
            # rows 1..14 of every block are probed
            for j in range(1, 15):
                labels = html.xpath(key_xpath_template % (i, j))
                if not labels:
                    continue
                field = template.get(labels[0])
                if field is None:
                    continue
                texts = html.xpath(val_xpath_template % (i, j))
                vals = [text.strip() for text in texts]
                vals = [text for text in vals if text]
                if vals:
                    d[field] = vals if len(vals) > 1 else vals[0]

        return d

    @inlineCallbacks
    def search(self, uid, agent):
        ''' Return (through the Deferred) the parsed profile data of ``uid``. '''
        _, feeds = yield self._search(uid, agent)
        returnValue(feeds)

    @inlineCallbacks
    def _search(self, uid, agent):
        '''
            Fetch the profile page of ``uid`` and parse it.

            :return: (through the Deferred) ``(True, data)`` with the result
                of getFeeds(), or ``(False, None)`` when the page content
                cannot be decoded or carries no profile HTML.
        '''

        log.debug("Fetched_uid: %s." % str(uid))
        page_content = yield self.getContent(agent, uid)
        try:
            html_content = self.getFeedsHtml(page_content)
        except ValueError:
            returnValue((False, None))
        if html_content is None:
            log.debug('got feed content: ' + page_content)
            # Nothing to parse; do not hand None on to getFeeds().
            returnValue((False, None))

        data = self.getFeeds(html_content)
        returnValue((True, data))
示例#13
0
class NodeService(ClientServiceBase):
    ''' Client node. '''

    servicename = 'observer.taobao.active_spider'

    def __init__(self, *args, **kwargs):
        ''' Set the node up from ``kwargs['cfg']``. '''

        ClientServiceBase.__init__(self, *args, **kwargs)
        cfg = kwargs['cfg']
        self.name = socket.gethostname() + cfg.prefix    # node name

        self.proxy = cfg.http_proxy     # handed to getAgent() in addAgent()
        self.user_agent = cfg.http_agent
        self.max_agent = cfg.max_agent
        self.agent_pool = TimedAgentPool()
        self.last_clear = 0
        self.ready = True

    def addAgent(self, seq):
        ''' Add a new agent to agent_pool and start its search loop. '''
        agent = getAgent(self.proxy, self.user_agent)
        agent.remove = False
        agent.seq = seq
        self.agent_pool.initAgent(agent)
        self.searchLoop(agent)

    @inlineCallbacks
    def startService(self):
        ''' Start the fetch service (process time zone is set to PRC). '''
        os.environ['TZ'] = 'PRC'
        time.tzset()
        yield ClientServiceBase.startService(self)
        self.fillAgents()

    @inlineCallbacks
    def fillAgents(self):
        ''' Every 10 seconds, top the pool up to ``max_agent`` agents. '''
        while 1:
            # NOTE(review): seq restarts at 0 on every pass, so agents added
            # in later passes reuse earlier seq values -- confirm intended.
            seq = 0
            while len(self.agent_pool.agents) < self.max_agent:
                seq += 1
                self.addAgent(seq)
            yield wait(10.)

    @inlineCallbacks
    def searchLoop(self, agent):
        '''
            Work loop of one agent.

            Pulls a (request id, tid) pair from the controller, runs search()
            for it and sends the result back. The loop ends when the agent is
            flagged for removal or after an InfiniteLoginError.
        '''
        needbreak = False
        while 1:
            if agent.remove:
                self.agent_pool.removeAgent(agent)
                break
            reqid, tid = yield self.callController('nextRequest', 'extract')
            log.info('Got tid %s from server' % tid)

            try:
                result = yield self.search(agent, tid)
                log.debug('Got data %s' % repr(result))
            except InfiniteLoginError:
                log.exception()
                yield self.callController("fail", tid=tid)
                result = None
                needbreak = True
            except:
                # Deliberate catch-all: one failed request must not end the
                # loop of this agent.
                log.exception()
                result = None
            # The result is sent without waiting for the call to complete.
            self.callController('sendResult', reqid, tid, result)
            if needbreak:
                break

    @inlineCallbacks
    def getContent(self, agent, uid, token=None):
        '''
            Get the target web page.

            The URL is ``uid`` followed by the module-level ``SUFFIX``.
            ``token`` is accepted for interface compatibility only; it is
            not used.
        '''
        url = "%s%s" % (uid, SUFFIX)
        log.debug('Getting data with url: %s' % url)
        result = yield request(agent, url)
        returnValue(result)

    @inlineCallbacks
    def search(self, agent, tid, token=None):
        '''
            Get the list of shop information for ``tid``.

            ``token`` is optional: searchLoop() calls this method with
            ``agent`` and ``tid`` only, which raised TypeError on every
            request while the parameter was mandatory.

            :return: (through the Deferred) the ``ids`` entry of the fetched
                JSON document, serialized again with ``json.dumps``;
                ``'null'`` when fetching or decoding failed.
        '''
        result = None
        try:
            data = yield self.getContent(agent, tid, token)
            result = json.loads(data).get('ids')
        except Exception as msg:
            log.debug("Got Something Wrong with tid: %s Error: %s" % (tid, repr(msg)))

        returnValue(json.dumps(result))