예제 #1
0
    def parse_episode_data(self, response):
        """Build a populated ``imdbItem`` from a single episode page.

        The series rating is not scraped here; it is copied from the item
        carried in ``response.meta['item']``. Every other field is the first
        node matched by its XPath, stripped of surrounding whitespace, so a
        page lacking any of them raises ``IndexError``.
        """
        selector = HtmlXPathSelector(response)

        def first_text(query):
            # First matching text node with surrounding whitespace removed.
            return selector.xpath(query).extract()[0].strip()

        series_item = response.meta['item']
        item = imdbItem()
        item['link'] = response.url.strip()
        item["seriesRating"] = series_item["seriesRating"][0].strip()
        item['showName'] = first_text('//h2[@class="tv_header"]/a/text()')
        item['episode'] = first_text(
            '//h2[@class="tv_header"]//span[@class="nobr"]/text()')
        item['episodeRating'] = first_text(
            '//span[@itemprop="ratingValue"]/text()')
        item['votes'] = serializeToInt(
            first_text('//span[@itemprop="ratingCount"]/text()'))
        item['genre'] = first_text('//span[@itemprop="genre"]/text()')
        item['director'] = first_text(
            '//div[@itemprop="director"]//span[@itemprop="name"]/text()')
        item['airDate'] = process_date(
            first_text(
                '//div[@id="title-overview-widget"]//h1[@class="header"]//span[@class="nobr"]/text()'
            ))
        # The video link is derived from the two fields scraped above.
        item['videoLink'] = process_link(item['showName'], item['episode'])
        return item
예제 #2
0
    def parse_items(self, response):
        """Read the series rating and schedule one request per season link.

        A fresh ``imdbItem`` is created with ``seriesRating`` set to the raw
        list of extracted rating strings (not yet stripped or indexed). For
        each season href found on the page a ``Request`` is yielded whose
        callback is ``self.parse_season_links``; the same item instance is
        attached to every request under ``meta['item']``.

        Nothing is yielded when the page has no season links.
        """
        hxs = HtmlXPathSelector(response)

        data = imdbItem()
        data["seriesRating"] = hxs.xpath(
            '//span[@itemprop="ratingValue"]/text()').extract()
        season_links = hxs.xpath(
            '//div[@id="titleTVSeries"]/div[1]//span[@class="see-more inline"]/a/@href'
        ).extract()

        # Follow the season links rather than jumping straight to the ratings
        # page: the season pages expose more data per episode.
        for season in season_links:
            # The base already ends in '/', so drop any leading slash from the
            # href to avoid building 'http://www.imdb.com//...'.
            link = 'http://www.imdb.com/' + season.lstrip('/')
            yield Request(link, callback=self.parse_season_links,
                          meta={'item': data})
예제 #3
0
 def parse_items(self, response):
     """Read the series rating and schedule one request per season link.

     A fresh ``imdbItem`` is created with ``seriesRating`` set to the raw
     list of extracted rating strings (not yet stripped or indexed). For
     each season href found on the page a ``Request`` is yielded whose
     callback is ``self.parse_season_links``; the same item instance is
     attached to every request under ``meta['item']``.

     Nothing is yielded when the page has no season links. The debug output
     uses single-argument ``print(...)`` calls, which print the same text
     whether ``print`` is the Python 2 statement or the Python 3 function.
     """
     hxs = Selector(response)
     print("came here")

     data = imdbItem()
     data["seriesRating"] = hxs.xpath('//span[@itemprop="ratingValue"]/text()').extract()
     print(data["seriesRating"])
     season_links = hxs.xpath('//*[@id="title-episode-widget"]/div/div[3]/a/@href').extract()
     print(season_links)

     # Follow the season links rather than jumping straight to the ratings
     # page: the season pages expose more data per episode.
     for season in season_links:
         # The base already ends in '/', so drop any leading slash from the
         # href to avoid building 'http://www.imdb.com//...'.
         link = 'http://www.imdb.com/' + season.lstrip('/')
         yield Request(link, callback=self.parse_season_links,
                       meta={'item': data})
예제 #4
0
 def parse_episode_data(self, response):
     """Scrape one episode page into an ``imdbItem`` and return it.

     ``seriesRating`` comes from the item stored in ``response.meta['item']``;
     all remaining fields are taken from the first node matched by their
     XPath on this page. A missing node therefore raises ``IndexError``.
     """
     page = Selector(response)
     parent = response.meta['item']

     def text_at(xpath):
         # Text of the first match, trimmed of surrounding whitespace.
         return page.xpath(xpath).extract()[0].strip()

     item = imdbItem()
     item['link'] = response.url.strip()
     item["seriesRating"] = parent["seriesRating"][0].strip()
     item['showName'] = text_at('//h2[@class="tv_header"]/a/text()')
     item['episode'] = text_at('//h2[@class="tv_header"]//span[@class="nobr"]/text()')
     item['episodeRating'] = text_at('//span[@itemprop="ratingValue"]/text()')

     # The vote count is passed through serializeToInt before being stored.
     raw_votes = text_at('//span[@itemprop="ratingCount"]/text()')
     item['votes'] = serializeToInt(raw_votes)

     item['genre'] = text_at('//span[@itemprop="genre"]/text()')
     item['director'] = text_at('//div[@itemprop="director"]//span[@itemprop="name"]/text()')

     # The air-date text is passed through process_date before being stored.
     raw_air_date = text_at('//div[@id="title-overview-widget"]//h1[@class="header"]//span[@class="nobr"]/text()')
     item['airDate'] = process_date(raw_air_date)

     item['videoLink'] = process_link(item['showName'], item['episode'])
     return item
예제 #5
0
    def parse_episode_ratings(self, response):
        """Turn the ratings cells on *response* into a list of ``imdbItem``s.

        The text of every ``<td align="right">`` is collected, and each value
        yielded by ``grouped(cells, 3)`` is unpacked as
        ``(episode, rating, votes)``. Title, link and series rating are copied
        onto every item from ``response.meta['item']``.
        """
        selector = HtmlXPathSelector(response)
        cells = selector.xpath('//td[@align="right"]/text()').extract()
        series_item = response.meta['item']

        def build_item(episode, rating, votes):
            item = imdbItem()
            # Series-level fields are shared by every episode row.
            for key in ("title", "link", "seriesRating"):
                item[key] = series_item[key]
            # Remove non-breaking spaces from the episode label.
            item["episode"] = episode.replace(u'\xa0', u'')
            item["episodeRating"] = rating
            item["votes"] = votes
            return item

        return [
            build_item(episode, rating, votes)
            for episode, rating, votes in grouped(cells, 3)
        ]
예제 #6
0
    def parse_episode_ratings(self, response):
        """Return one ``imdbItem`` per episode row found on the ratings page.

        Text from all ``<td align="right">`` cells is extracted and walked via
        ``grouped(cells, 3)``, each group being unpacked as
        ``(episode, rating, votes)``. The title, link and series rating of the
        item in ``response.meta['item']`` are copied onto each result.
        """
        inherited_fields = ("title", "link", "seriesRating")

        page = Selector(response)
        items = []
        cells = page.xpath('//td[@align="right"]/text()').extract()
        parent = response.meta['item']

        for episode, rating, votes in grouped(cells, 3):
            entry = imdbItem()
            for field in inherited_fields:
                entry[field] = parent[field]
            # Non-breaking spaces are stripped out of the episode label.
            entry["episode"] = episode.replace(u'\xa0', u'')
            entry["episodeRating"] = rating
            entry["votes"] = votes
            items.append(entry)

        return items