Exemplo n.º 1
0
    def parse_mini_card(self, response):
        """Attach the author's profile id to a comment and crawl unseen users.

        Reads the comment item from ``response.meta['item']`` and, for a 404
        response, the partially filled user item from
        ``response.meta['uItem']``.

        Yields:
            * HTTP 404: the user item and the comment item taken from
              ``meta``, unchanged, and nothing else.
            * Author uid already in ``self.users``: the comment item only
              (with its ``uid`` replaced by the one found on the card).
            * Otherwise: a request for the author's profile page handled by
              ``self.parse_profile``, carrying the new user item as
              ``meta['item']`` and the comment item as ``meta['pItem']``.

        Raises:
            IndexError: if a non-404 page has no ``target="_blank"`` link or
                no ``userInfo__content-name`` heading.
        """
        if response.status == 404:
            # No mini card exists for this user: emit what was collected so
            # far and stop. Without the return the code below would try to
            # parse the error page.
            yield response.meta['uItem']
            yield response.meta['item']
            return

        pItem = response.meta['item']
        profile_href = response.xpath(
            '//a[@target="_blank"]/@href').extract()[0]
        # The uid is the last path segment of the profile link.
        uid = profile_href.split('/')[-1]
        pItem['uid'] = uid
        if uid in self.users:
            # Profile already scheduled/seen; only the comment is new.
            yield pItem
            return

        self.users.add(uid)
        item = userItem()
        item['disPage'] = pItem['disPage']
        item['uid'] = uid
        name = response.xpath(
            '//h5[@class="userInfo__content-name"]/text()').extract()[0]
        item['firstName'], item['lastName'] = self.getName(name)
        item['source'] = response.urljoin(profile_href)
        request = scrapy.Request(item['source'],
                                 callback=self.parse_profile,
                                 dont_filter=True)
        request.meta['item'] = item
        request.meta['pItem'] = pItem
        yield request
Exemplo n.º 2
0
    def parse_author(self, response, disPage):
        """Build a user item for the author block of a page.

        Args:
            response: page response; queried with XPath and used to resolve
                the author's relative profile link.
            disPage: value stored verbatim in the item's ``disPage`` field.

        Returns:
            A ``userItem`` with ``disPage``, ``uid`` and ``source`` always
            set, and ``firstName``, ``lastName``, ``occupation``,
            ``account``/``points`` and ``state``/``city`` set only when the
            corresponding markup is present.

        Raises:
            IndexError: if the page has no link inside ``author-details``.
            ValueError: if the ``find_agents`` links do not number exactly
                two.
        """
        author = userItem()
        author['disPage'] = disPage

        profile_href = response.xpath(
            '//div[@class="author-details"]//a/@href').extract()[0]
        # The uid is the last path segment of the profile link.
        author['uid'] = profile_href.split('/')[-1]
        author['source'] = response.urljoin(profile_href)

        # Query both name parts first, then store whichever ones were found.
        name_parts = (
            ('firstName', response.xpath(
                '//span[@class="given-name"]/text()').extract()),
            ('lastName', response.xpath(
                '//span[@class="family-name"]/text()').extract()),
        )
        for field, values in name_parts:
            if values:
                author[field] = values[0]

        job = response.xpath(
            '//div[@class="agent-details-col"]/div/text()').extract()
        if job:
            author['occupation'] = job[0].strip()

        badge = response.xpath('//div[@class="agent-mast-img"]/div')
        if badge:
            account, points = self.getRainmaker(badge)
            author['account'] = account
            author['points'] = points

        place_links = response.xpath(
            '//div[@id="find_agents"]/p/a/@href').extract()
        if place_links:
            # Each link ends in the place name; the first is the state and
            # the second the city.
            state, city = [href.split('/')[-1] for href in place_links]
            author['state'] = state
            author['city'] = city

        return author
Exemplo n.º 3
0
    def parse_user(self, c, disPage):
        """Build a user item from a single comment selector.

        Args:
            c: selector for one comment; queried with relative XPaths.
            disPage: value stored verbatim in the item's ``disPage`` field.

        Returns:
            A ``userItem`` with ``disPage``, ``uid``, ``source``,
            ``firstName`` and ``lastName`` always set, and ``occupation``,
            ``account``/``points`` and ``city``/``state`` set only when the
            corresponding markup is present.

        Raises:
            IndexError: if the comment has neither a profile link nor a
                ``comment-author`` text node.
        """
        item = userItem()
        item['disPage'] = disPage
        user = c.xpath('div[@class="comment-left-section"]/a/@href').extract()
        if not user:
            # No profile link: fall back to the author's display text.
            # NOTE(review): in this fallback 'source' becomes the base URL
            # followed by the display text, which is probably not a real
            # profile URL - confirm this is intended.
            user = c.xpath('.//div[@class="comment-author"]/text()').extract()
        item['uid'] = user[0].split('/')[-1]
        item['source'] = 'http://activerain.com' + user[0]
        name = c.xpath('.//div[@class="comment-author"]/text()').extract()[0]
        item['firstName'], item['lastName'] = self.getName(name)
        occupation = c.xpath('.//div[@class="tagline"]/text()').extract()
        if occupation:
            item['occupation'] = occupation[0].strip()

        rainmaker = c.xpath('.//div[@class="comment-header"]/div')
        if rainmaker:
            item['account'], item['points'] = self.getRainmaker(rainmaker)

        location = c.xpath('.//div[@class="company"]/text()').extract()
        if location:
            # The place is whatever follows the last '-' in the company line.
            place = location[0].split('-')[-1].strip()
            if ',' in place:
                # Split on the last comma only, so a place containing extra
                # commas no longer raises on unpacking, and drop the
                # whitespace that surrounds the comma.
                city, state = place.rsplit(',', 1)
                item['city'] = city.strip()
                item['state'] = state.strip()

        return item
Exemplo n.º 4
0
 def parse_comments(self, response):
     """Turn a JSON comment payload into mini-vcard requests.

     ``response.text`` is decoded as JSON and iterated as a mapping whose
     values are sequences of comment HTML fragments (presumably keyed by
     the id of the comment being replied to - confirm against the caller).
     ``response.meta['blog']`` supplies the blog's ``URL``, ``title``,
     ``blogPage``, the starting ``count`` and the ``replyid`` lookup.

     Every fragment advances the running reply counter, including those
     that are skipped. A group whose key has no truthy entry in
     ``blog['replyid']`` is treated as belonging to a deleted post and is
     skipped without being parsed.

     Yields:
         One request per kept comment to the author's ``mini_vcard`` page,
         handled by ``self.parse_mini_card``, with the post item in
         ``meta['item']``, a partial user item in ``meta['uItem']`` and
         404 responses allowed through to the callback.

     Raises:
         IndexError: if a kept fragment lacks the comment id, publication
             date, author id or author name markup.
         ValueError: if the comment id is not an integer or the date does
             not match ``%Y-%m-%dT%H:%M:%S``.
     """
     blog = response.meta['blog']
     count = blog['count']
     threads = json.loads(response.text)
     for parent_id in threads:
         fragments = threads[parent_id]
         # The parent lookup depends only on the group key, so resolve it
         # once per group rather than once per comment.
         reply_to = blog['replyid'].get(parent_id)
         if not reply_to:
             # Ignore deleted post. The counter still advances by one per
             # skipped comment so later reply ids keep the same numbering.
             count += len(fragments)
             continue

         for fragment in fragments:
             count += 1
             c = Selector(text=fragment)

             item = postItem()
             item['URL'] = blog['URL']
             item['title'] = blog['title']
             item['disPage'] = blog['blogPage']
             item['pid'] = int(
                 c.xpath('//div[@class="blog-comment-comment "]/@data-id').
                 extract()[0])
             item['replyid'] = count
             posted = c.xpath(
                 './/meta[@itemprop="datePublished"]/@content').extract()[0]
             item['postTime'] = datetime.strptime(posted,
                                                  '%Y-%m-%dT%H:%M:%S')
             item['replyTo'] = reply_to
             body = c.xpath(
                 './/div[@class="blog-comment-comment-body"]//text()'
             ).extract()
             item['body'] = ''.join(body).strip()

             uid = c.xpath(
                 '//div[@class="blog-comment-comment-details"]/div/@data-id'
             ).extract()[0]
             item['uid'] = uid

             uItem = userItem()
             uItem['uid'] = uid
             name = c.xpath(
                 '//div[contains(@class, "agent-tag")]/text()').extract()[0]
             uItem['firstName'], uItem['lastName'] = self.getName(name)

             url = 'http://activerain.com/profile/{0}/mini_vcard'.format(
                 uid)
             request = scrapy.Request(url,
                                      callback=self.parse_mini_card,
                                      dont_filter=True)
             request.meta['item'] = item
             request.meta['uItem'] = uItem
             # Let 404s reach the callback so the comment is still emitted
             # when the author has no mini vcard.
             request.meta['handle_httpstatus_list'] = [404]
             yield request