Example 1
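A Scrapy callback that extracts 4chan posts from the JSON blob embedded in an inline <script> tag, cleans each comment body, and yields one PostItem per post.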
    def parse_page(self, response):
        hxs = scrapy.Selector(response)
        log.debug(hxs)

        scripts = hxs.xpath('//script[@type="text/javascript"]')
        for script in scripts:
            js = " ".join(script.xpath('./text()').extract())

            matches = js_re.match(js)

            if matches:
                json_data = json.loads(matches.group(2))

                for post_id, post_data in json_data.iteritems():
                    # Unescape entities, strip HTML tags, then drop quote
                    # links (">>12345") from the comment body.
                    body = re.sub(
                        r">>\d+", "",
                        html_util.strip_tags(parser.unescape(
                            post_data['com'])))
                    #log.debug(post_data)

                    # 'resto' is the parent thread number; 0 means the post
                    # is itself a thread OP.
                    if 'resto' in post_data and int(post_data['resto']) != 0:
                        url = u"http://boards.4chan.org/{}/thread/{}#p{}".format(
                            post_data['board'], post_data['resto'],
                            post_data['no'])
                    else:
                        url = u"http://boards.4chan.org/{}/thread/{}".format(
                            post_data['board'], post_data['no'])

                    yield items.PostItem(site_id=sites.FOURCHAN,
                                         points=int(post_data['replies'])
                                         if post_data['replies'] else 0,
                                         site_post_id=post_id,
                                         body=body,
                                         sn=post_data['name'],
                                         url=url)
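These snippets lean on module-level names the listing omits (js_re, parser, html_util, util, items, sites, log). A minimal sketch of what the first two might look like; the pattern is a guess, not the original source:

import re
from HTMLParser import HTMLParser  # html.parser.HTMLParser on Python 3

# Hypothetical: match an inline assignment such as "var catalog = {...};".
# group(2) must capture the JSON object mapping post IDs to post dicts.
js_re = re.compile(r"\s*(var\s+\w+\s*=\s*)(\{.*\})", re.DOTALL)

parser = HTMLParser()  # provides parser.unescape(...)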
Example 2
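Parses a Tumblr discovery response: the JSON payload carries HTML fragments, each of which is re-parsed with a Selector to pull out the post ID, author, tags, body text, and note count.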
    def parse_page(self, response):
        data = json.loads(response.body)

        discovery_posts = data['response']['DiscoveryPosts']
        posts = discovery_posts['posts']

        for post in posts:
            try:
                hxs = scrapy.Selector(text=post)

                entry = hxs.xpath('//article[@data-id]')
                if not entry:
                    continue

                post_id = entry.xpath('./@data-id').extract()[0].strip()
                #log.debug(post_id)
                author = entry.xpath(
                    './@data-tumblelog-name').extract()[0].strip()
                #log.debug(author)

                header = entry.xpath('.//header')
                href = util.get_url_from_node(response,
                                              header.xpath('./div/a/@href'))
                #log.debug(href)

                tags = entry.xpath(
                    './/section[@class="post_tags"]/div/a[@class="post_tag"]/@data-tag'
                ).extract()
                #log.debug(tags)

                body = entry.xpath(
                    './/div[@class="post_body"]//text()').extract()
                #log.debug(body)

                # Fold the post's tags into the searchable body text.
                body += tags

                # Join the text nodes, then unescape HTML entities.
                body_text = parser.unescape(" ".join(body))
                #log.debug(body_text)

                notes_str = entry.xpath(
                    './/div[@class="post_notes_inner"]//span[@class="note_link_current"]/@data-count'
                ).extract()[0].strip()
                votes = self.parse_notes(notes_str)
                #log.debug(votes)

                yield items.PostItem(site_id=sites.TUMBLR,
                                     points=votes,
                                     site_post_id=post_id,
                                     body=body_text,
                                     sn=author,
                                     url=href)
            except Exception:
                log.exception(u"Failed")
                log.debug(post)
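util.get_url_from_node appears in every example; judging by its call sites, it resolves a (possibly relative) @href node against the page URL. A hypothetical equivalent:

import urlparse  # urllib.parse on Python 3

def get_url_from_node(response, node):
    # node is a SelectorList for an @href attribute; make it absolute.
    return urlparse.urljoin(response.url, node.extract()[0].strip())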
Example 3
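Scrapes a YouTube shelf listing, reading the video ID, title, URL, description, uploader, and view count from each expanded-shelf item.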
    def parse_page(self, response):
        hxs = scrapy.Selector(response)
        log.debug(hxs)

        entries = hxs.xpath(
            '//li[contains(@class,"expanded-shelf-content-item-wrapper")]')
        log.debug(entries)
        for thing in entries:
            log.debug(thing)
            item_node = thing.xpath(
                './div[contains(@class, "expanded-shelf-content-item")]')

            post_id = item_node.xpath(
                './div[contains(@class, "yt-lockup-video")]/@data-context-item-id'
            ).extract()[0].strip()

            title_link = item_node.xpath(
                './/div[contains(@class, "yt-lockup-content")]/h3[contains(@class, "yt-lockup-title")]/a'
            )

            title = title_link.xpath('./text()').extract()[0].strip()

            href = util.get_url_from_node(response,
                                          title_link.xpath('./@href'))

            body = " ".join(
                item_node.xpath(
                    './/div[contains(@class, "yt-lockup-content")]/div[contains(@class, "yt-lockup-description")]//text()'
                ).extract()).strip()

            author = item_node.xpath(
                './/div[contains(@class, "yt-lockup-content")]/div[contains(@class, "yt-lockup-byline")]/a/text()'
            ).extract()[0].strip()

            views = 0
            try:
                # The second meta entry holds the view count ("1,234 views").
                views_str = item_node.xpath(
                    './/div[contains(@class, "yt-lockup-content")]//ul[contains(@class, "yt-lockup-meta-info")]/li/text()'
                ).extract()[1].strip()
                matches = views_re.match(views_str)
                views = int(matches.group(1).replace(",", ""))
            except Exception:
                log.exception(u"Exception parsing {}".format(thing))

            yield items.PostItem(site_id=sites.YOUTUBE,
                                 points=views,
                                 site_post_id=post_id,
                                 body=body,
                                 sn=author,
                                 url=href)
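views_re is defined elsewhere in the module. Since the code takes matches.group(1) and strips commas, it presumably captures the comma-grouped number from strings like "1,234,567 views". A plausible definition:

views_re = re.compile(r"([\d,]+)\s+views?")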
Example 4
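Walks Medium post previews (postItem divs), falling back to the title when a preview has no body paragraphs and using the recommend count as the score.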
    def parse_page(self, response):
        hxs = scrapy.Selector(response)

        entries = hxs.xpath('//div[contains(@class,"postItem")]')
        for thing in entries:
            post_id = thing.xpath('./@data-post-id').extract()[0].strip()

            entry = thing.xpath(
                './/div[contains(@class, "postArticle-content")]')

            title_node = entry.xpath('.//h3')

            title = " ".join(title_node.xpath('.//text()').extract()).strip()

            href = util.get_url_from_node(
                response,
                thing.xpath(
                    './/article[contains(@class, "postArticle")]/a/@href'))

            body_node = entry.xpath(
                './/div[contains(@class, "section-inner")]/p')
            # Fall back to the title when the preview has no body paragraphs.
            body = title

            if body_node:
                body = " ".join(body_node.xpath('.//text()').extract()).strip()

            header = thing.xpath(
                './/div[contains(@class, "postMeta-previewHeader")]')
            author = header.xpath('.//a[@data-action="show-user-card"]/text()'
                                  ).extract()[0].strip()

            votes = 0
            try:
                votes_str = thing.xpath(
                    './/button[@data-action="show-recommends"]/text()'
                ).extract()[0].strip().replace(",", "")
                votes = self.parse_votes(votes_str)
            except Exception:
                log.exception(u"Exception parsing {}".format(thing))

            yield items.PostItem(site_id=sites.MEDIUM,
                                 points=votes,
                                 site_post_id=post_id,
                                 body=body,
                                 sn=author,
                                 url=href)
Example 5
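Parses a Reddit listing, yielding one item per "thing" div and following the rel="nofollow next" pagination link for up to ten pages.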
    def parse_page(self, response):
        num = response.meta['num']

        hxs = selector.HtmlXPathSelector(response)

        entries = hxs.xpath('//div[contains(@class,"thing")]')
        for thing in entries:
            post_id = thing.xpath('./@data-fullname').extract()[0].strip()
            entry = thing.xpath('./div[@class="entry unvoted"]')

            href = util.get_url_from_node(
                response, entry.xpath('./p[@class="title"]/a/@href'))
            title = entry.xpath(
                './p[@class="title"]/a/text()').extract()[0].strip()

            author = entry.xpath(
                './p[@class="tagline"]/a/text()').extract()[0].strip()
            votes = 0
            try:
                votes_str = thing.xpath(
                    './div[@class="midcol unvoted"]/div[@class="score unvoted"]/text()'
                ).extract()[0].strip()
                #log.debug(u"Votes string {}".format(votes_str))
                votes = int(votes_str)
            except Exception:
                # The score can be hidden or absent; keep the default of 0.
                pass
                #log.exception(u"Exception parsing {}".format(thing))

            yield items.PostItem(site_id=sites.REDDIT,
                                 points=votes,
                                 site_post_id=post_id,
                                 body=title,
                                 sn=author,
                                 url=href)

        # Follow the "next" pager for up to 10 listing pages.
        if num < 10:
            next_link = hxs.xpath('//a[@rel="nofollow next"]/@href')
            if next_link:
                yield http.Request(util.get_url_from_node(response, next_link),
                                   meta={
                                       'type': 'page',
                                       'num': num + 1
                                   })
Example 6
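Parses a Hacker News front page by pairing title cells with their subtext rows, then follows the "More" link for up to five pages.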
    def parse_page(self, response):
        num = response.meta['num']

        hxs = selector.HtmlXPathSelector(response)

        links = hxs.xpath('//td[@class="title"]/a')
        subtext = hxs.xpath('//td[@class="subtext"]')

        # Pair each title cell with its subtext row; the trailing "More"
        # anchor has no subtext row, and the slice trims the final pairing.
        for link, td in zip(links, subtext)[:-1]:
            sn = td.xpath('./a/text()')[0].extract()

            points_text = td.xpath('./span/text()')[0].extract()
            items_href = td.xpath('./a/@href')[1].extract()

            # "N points" -> N.
            m = points_re.match(points_text)
            points = int(m.group(1))

            # The comments link ("item?id=N") carries the post ID.
            m = post_id_re.match(items_href)
            post_id = int(m.group(1))

            post = link.xpath('./text()')[0].extract()
            content_link = util.get_url_from_node(response,
                                                  link.xpath('./@href'))

            yield items.PostItem(site_id=sites.HN,
                                 points=points,
                                 site_post_id=post_id,
                                 body=post,
                                 sn=sn,
                                 url=content_link)

        # The last title anchor is the "More" pager; follow it for up to
        # five pages.
        if num < 5:
            more_link = links[-1]
            more_url = util.get_url_from_node(response,
                                              more_link.xpath('./@href'))

            yield http.Request(more_url, meta={'type': 'page', 'num': num + 1})
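points_re and post_id_re are likewise module-level. Hypothetical definitions consistent with how their groups are used (the "N points" span and the item?id=N comments link):

points_re = re.compile(r"(\d+)\s+points?")
post_id_re = re.compile(r"item\?id=(\d+)")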