Exemplo n.º 1
0
    def page_url(self, tv_listing):
        """
        Search IMDB for a listing and return the url of the first result.

        The url built by ``self.finder_url(tv_listing)`` is fetched first. If
        that page has no "findList" table, the search is repeated with
        ``self.finder_url(tv_listing, titles=True)``.

        :param tv_listing: listing to search for; passed unchanged to
        ``self.finder_url``
        :return: ``self.url`` concatenated with the href of the first anchor in
        the first result row, e.g.
        http://www.imdb.com/title/tt0898266/?ref_=fn_tt_tt_1
        """
        search_url = self.finder_url(tv_listing)
        with contextlib.closing(urllib.urlopen(search_url)) as page_response:
            helper = ScrapeHelper(page_response.read())

        # The first response has been read in full and closed by now, so the
        # fallback request below never keeps two connections open at once.
        if not helper.is_table_exists("findList"):
            fallback_url = self.finder_url(tv_listing, titles=True)
            with contextlib.closing(
                    urllib.urlopen(fallback_url)) as fallback_response:
                helper = ScrapeHelper(fallback_response.read())

        rows = helper.find_table_by_class("findList")
        # NOTE(review): nothing guards against an empty result set here;
        # rows[0] presumably raises IndexError in that case -- confirm
        # against ScrapeHelper.find_table_by_class.
        top_result = rows[0].find('td').find('a').get('href')
        return self.url + top_result
Exemplo n.º 2
0
    def get_scores(self, read):
        """
        Scrape the rating figures out of the contents of an IMDB listing page.

        :param read: page contents, handed to ScrapeHelper
        :return: dict with the keys popularity, popularity_change, rating,
        best_rating, users, reviews and external_reviews. The last three stay
        '' when no anchor with the matching href is found.
        """
        scraper = ScrapeHelper(read)
        scores = {
            'popularity': self.getPopularity(scraper),
            'popularity_change': self.meter_change(scraper),
        }

        rating_block = scraper.find_div_by_itemtype(
            'http://schema.org/AggregateRating')

        def span_text(itemprop):
            # Text of the <span itemprop=...> inside the rating block, with
            # thousands separators removed so it can be parsed as a number.
            span = scraper.find_by_itemprop('span', itemprop, rating_block)
            return span.text.replace(",", "")

        scores['rating'] = float(span_text('ratingValue'))
        scores['best_rating'] = int(span_text('bestRating'))

        # href (query string stripped) -> key of the result dict it fills
        count_keys = {
            'ratings': 'users',
            'reviews': 'reviews',
            'externalreviews': 'external_reviews',
        }
        scores['users'] = scores['reviews'] = scores['external_reviews'] = ''

        for link in scraper.find_all_anchors(rating_block):
            target = link.get('href').split('?')[0]
            # The count is parsed for every anchor, matched or not.
            total = long(self.getCount(link))
            if target in count_keys:
                scores[count_keys[target]] = total

        return scores
Exemplo n.º 3
0
    def get_scores(self, read):
        """
        Collect the critic and audience rating figures from a listing page.

        :param read: page contents, handed to ScrapeHelper
        :return: dict with two sub-dicts, 'all_critic' and 'avg_audience', each
        holding a rating, a best rating, a worst rating and a user count
        """
        scraper = ScrapeHelper(read)

        def span_count(scope):
            return self.getCount(scraper.ratingValue_in_span(scope))

        def meta_count(name, scope):
            return self.getCount(scraper.rating_in_meta(name, scope))

        # NOTE: the two sub-dicts spell their keys differently ('bestrating'
        # vs 'best_rating'); the spelling is kept because callers may rely on it.
        critics_div = scraper.find_by_id('div', 'all-critics-numbers')
        all_critic = {}
        all_critic['rating'] = span_count(critics_div)
        all_critic['bestrating'] = meta_count('bestrating', critics_div)
        all_critic['worstrating'] = meta_count('worstrating', critics_div)
        all_critic['users'] = meta_count('reviewCount ratingCount',
                                         critics_div)

        audience_div = scraper.find_by_class('div', 'audience-score meter')
        avg_audience = {}
        avg_audience['rating'] = span_count(audience_div)
        avg_audience['best_rating'] = meta_count('bestrating', audience_div)
        avg_audience['worst_rating'] = meta_count('worstrating', audience_div)
        avg_audience['users'] = meta_count('ratingCount', audience_div)

        # One-second pause before returning -- presumably to throttle
        # successive page fetches; confirm with the caller.
        time.sleep(1)
        return {'all_critic': all_critic, 'avg_audience': avg_audience}
Exemplo n.º 4
0
 def parse_search_page(self, read, type_):
     """
     Return the url of the first hit on a search results page.

     :param read: page contents, handed to ScrapeHelper
     :param type_: prefix of the results list id; the list looked up is the
     <ul> whose id is type_ + '_results_ul'
     :return: self.url concatenated with the href of the first anchor in that list
     """
     scraper = ScrapeHelper(read)
     results_list = scraper.find_by_id('ul', type_ + '_results_ul')
     first_anchor = scraper.find_all_anchors(results_list)[0]
     return self.url + first_anchor.get('href')
Exemplo n.º 5
0
    def get_scores(self, read):
        """
        Collect the critic and audience rating figures from a listing page.

        :param read: page contents, handed to ScrapeHelper
        :return: dict with two sub-dicts, "all_critic" and "avg_audience", each
        holding a rating, a best rating, a worst rating and a user count
        """
        scraper = ScrapeHelper(read)
        count_of = self.getCount
        span_value = scraper.ratingValue_in_span
        meta_value = scraper.rating_in_meta

        # NOTE: the two sub-dicts spell their keys differently ("bestrating"
        # vs "best_rating"); the spelling is kept because callers may rely on it.
        critics_div = scraper.find_by_id("div", "all-critics-numbers")
        critic_rating = count_of(span_value(critics_div))
        critic_best = count_of(meta_value("bestrating", critics_div))
        critic_worst = count_of(meta_value("worstrating", critics_div))
        critic_users = count_of(meta_value("reviewCount ratingCount", critics_div))

        audience_div = scraper.find_by_class("div", "audience-score meter")
        audience_rating = count_of(span_value(audience_div))
        audience_best = count_of(meta_value("bestrating", audience_div))
        audience_worst = count_of(meta_value("worstrating", audience_div))
        audience_users = count_of(meta_value("ratingCount", audience_div))

        # One-second pause before returning -- presumably to throttle
        # successive page fetches; confirm with the caller.
        time.sleep(1)
        return {
            "all_critic": {
                "rating": critic_rating,
                "bestrating": critic_best,
                "worstrating": critic_worst,
                "users": critic_users,
            },
            "avg_audience": {
                "rating": audience_rating,
                "best_rating": audience_best,
                "worst_rating": audience_worst,
                "users": audience_users,
            },
        }
Exemplo n.º 6
0
 def parse_search_page(self, read, type_):
     """
     Return the url of the first hit on a search results page.

     :param read: page contents, handed to ScrapeHelper
     :param type_: prefix of the results list id; the list looked up is the
     <ul> whose id is type_ + "_results_ul"
     :return: self.url concatenated with the href of the first anchor in that list
     """
     scraper = ScrapeHelper(read)
     list_id = type_ + "_results_ul"
     result_anchors = scraper.find_all_anchors(scraper.find_by_id("ul", list_id))
     top_hit = result_anchors[0]
     return self.url + top_hit.get("href")
Exemplo n.º 7
0
 def test_meter_change_descend(self):
     """
     Print the meter change parsed from the saved "popularity descend" listing page.

     The result is only printed, not asserted. The fixture is read from a
     hard-coded absolute path, so this only runs where that file exists.
     """
     fixture_path = '/home/sarath/start-internet-idea/tvguide/resources/imdb_listing_with_popularity_descend.html'
     # Read through a context manager so the fixture's file handle is closed
     # deterministically rather than left to the garbage collector.
     with open(fixture_path, 'r') as fixture:
         helper = ScrapeHelper(fixture.read())
     print(self.imdb.meter_change(helper))