# Example #1 (score: 0)
def get_top_box_office_by_year(year, number, debug=False):
    """ Pull out the 'number' highest-grosing
        movies of the year.
    """
    NUM_MOVIES_PER_PAGE = 50

    sort = 'boxoffice_gross_us'

    def get_website(start, year):
        website = 'http://www.imdb.com/search/title?at=0&sort=%s&start=%s&title_type=feature&year=%s,%s' % (
            sort, start, year, year)
        return website

    n = 1

    ret_list = OrderedDict()

    while n < number:
        print 'n=%s/%s' % (n, number)
        url_page = get_website(start=n, year=year)

        print url_page
        n += NUM_MOVIES_PER_PAGE

        # I don't get why, but IMDB barfs when I specify a user agent???
        soup = get_soup(url_page, no_user_agent=True)

        # Match on <td class="number">, which refers to the ranking of the movie
        all_movies = soup.findAll('td', **{'class': "number"})

        for movie in all_movies:
            title_part = movie.next.next.next.next.next.next.next.next.next.next.next.next.next

            movie_name = clean_unicode(title_part.next)

            link = str(title_part['href'])
            m = re.match('/title/tt(\d+)/', link)
            groups = m.groups()
            assert len(groups) == 1
            imdb_movie_id = int(groups[0])

            _year = title_part.next.next.next.next
            m = re.match(r'\((\d+)\)', _year)
            groups = m.groups()
            assert len(groups) == 1
            year = int(groups[0])

            ret_list[imdb_movie_id] = movie_name

            # if only a few movies are requested
            if len(ret_list) == number:
                return ret_list

    return ret_list
# Example #2 (score: 0)
    def scrape_main_page(self):
        """ Fetch the movie's main IMDB page and run every per-field scraper. """
        # Download and parse the main page once; the per-field scrapers
        # below presumably read self.main_page_soup — confirm in class body.
        self.main_page_url = self.get_main_page_url(self.imdb_movie_id)
        self.main_page_soup = get_soup(self.main_page_url)

        # Run each field scraper, preserving the original call order.
        for scrape_step in (self.scrape_title,
                            self.scrape_nreviews,
                            self.scrape_release_date,
                            self.scrape_budget,
                            self.scrape_gross,
                            self.scrape_description,
                            self.get_posters):
            scrape_step()
# Example #3 (score: 0)
def get_top_box_office_by_year(year, number, debug=False):
    """ Pull out the 'number' highest-grosing
        movies of the year.
    """
    NUM_MOVIES_PER_PAGE=50

    sort='boxoffice_gross_us'

    def get_website(start,year):
        website='http://www.imdb.com/search/title?at=0&sort=%s&start=%s&title_type=feature&year=%s,%s' % (sort,start,year,year)
        return website

    n=1

    ret_list=OrderedDict()

    while n<number:
        print 'n=%s/%s' % (n,number)
        url_page = get_website(start=n,year=year)

        print url_page
        n+=NUM_MOVIES_PER_PAGE

        # I don't get why, but IMDB barfs when I specify a user agent???
        soup=get_soup(url_page,no_user_agent=True)

        # Match on <td class="number">, which refers to the ranking of the movie
        all_movies=soup.findAll('td',**{'class':"number"})

        for movie in all_movies:
            title_part=movie.next.next.next.next.next.next.next.next.next.next.next.next.next

            movie_name=clean_unicode(title_part.next)

            link=str(title_part['href'])
            m=re.match('/title/tt(\d+)/',link)
            groups=m.groups()
            assert len(groups)==1
            imdb_movie_id=int(groups[0])

            _year=title_part.next.next.next.next
            m=re.match(r'\((\d+)\)',_year)
            groups=m.groups()
            assert len(groups)==1
            year=int(groups[0])

            ret_list[imdb_movie_id]=movie_name

            # if only a few movies are requested
            if len(ret_list) == number:
                return ret_list

    return ret_list
# Example #4 (score: 0)
    def scrape_main_page(self):
        """ Load the main IMDB page for this movie and scrape each field. """
        # One network fetch, parsed once; the scrape_* steps presumably
        # consume self.main_page_soup — confirm in the class body.
        self.main_page_url = self.get_main_page_url(self.imdb_movie_id)
        self.main_page_soup = get_soup(self.main_page_url)

        # Dispatch the per-field scrapers by name, in the original order.
        for step_name in ('scrape_title', 'scrape_nreviews',
                          'scrape_release_date', 'scrape_budget',
                          'scrape_gross', 'scrape_description',
                          'get_posters'):
            getattr(self, step_name)()
# Example #5 (score: 0)
    def get_reviews_from_page(self, imdb_review_url):
        """ Scrape every user review from one IMDB review page.

        Parameters
        ----------
        imdb_review_url : str
            URL of one page of user reviews for a movie.

        Returns
        -------
        list
            One entry per successfully parsed review; reviews that fail
            to parse are logged and skipped (best-effort).
        """
        soup = get_soup(imdb_review_url)

        # find all reviews on the page
        # The easiest way is to match on user avatars:
        all_reviews_html = soup.findAll('img', **{'class': "avatar"})

        all_reviews = []
        for i in all_reviews_html:
            try:
                all_reviews.append(self.get_review_from_page(i, imdb_review_url))
            except Exception:
                # Narrowed from a bare 'except:' so KeyboardInterrupt and
                # SystemExit still propagate; parse failures stay best-effort.
                print 'Error Reading in review on page %s' % imdb_review_url
                if self.debug:
                    traceback.print_exc()
        return all_reviews
# Example #6 (score: 0)
    def get_reviews_from_page(self, imdb_review_url):
        """ Scrape every user review from one IMDB review page.

        Parameters
        ----------
        imdb_review_url : str
            URL of one page of user reviews for a movie.

        Returns
        -------
        list
            One entry per successfully parsed review; reviews that fail
            to parse are logged and skipped (best-effort).
        """
        soup = get_soup(imdb_review_url)

        # find all reviews on the page
        # The easiest way is to match on user avatars:
        all_reviews_html = soup.findAll('img', **{'class': "avatar"})

        all_reviews = []
        for i in all_reviews_html:
            try:
                all_reviews.append(
                    self.get_review_from_page(i, imdb_review_url))
            except Exception:
                # Narrowed from a bare 'except:' so KeyboardInterrupt and
                # SystemExit still propagate; parse failures stay best-effort.
                print 'Error Reading in review on page %s' % imdb_review_url
                if self.debug:
                    traceback.print_exc()
        return all_reviews
# Example #7 (score: 0)
def _get_movie_list(url):
    """ Scrape a 100-entry IMDB movie chart page.

    Walks the parse tree from the 'Votes' column header to each of the
    100 title anchors, then pulls the numeric id out of every
    /title/tt<id>/ href.

    Parameters
    ----------
    url : str
        URL of the chart page to scrape.

    Returns
    -------
    DataFrame
        Columns 'imdb_movie_id' (int) and 'ranking' (0-based position).
    """
    NUM_MOVIES = 100  # the chart lists exactly 100 titles

    soup = get_soup(url)
    votes = soup.find(text='Votes')
    # Hop forward to the first title anchor; the offsets are tied to
    # IMDB's exact markup.
    current_movie = votes.next.next.next.next.next.next.next.next.next.next.next

    movies = [current_movie]
    for i in range(NUM_MOVIES - 1):
        # Each subsequent title anchor sits a fixed number of nodes ahead.
        current_movie = current_movie.next.next.next.next.next.next.next.next.next.next.next.next.next.next.next.next
        movies.append(current_movie)

    imdb_movie_id = []
    ranking = []

    # Compile once (hoisted out of the loop); raw string avoids the
    # deprecated '\d' escape.  The unused 'ret = OrderedDict()' local
    # from the original has been dropped.
    title_re = re.compile(r'/title/tt(\d+)/')
    for i, movie in enumerate(movies):
        m = title_re.match(movie['href'])
        imdb_movie_id.append(int(m.groups()[0]))
        ranking.append(i)

    return DataFrame({'imdb_movie_id': imdb_movie_id, 'ranking': ranking})
# Example #8 (score: 0)
def _get_movie_list(url):
    """ Scrape a 100-entry IMDB movie chart page.

    Walks the parse tree from the 'Votes' column header to each of the
    100 title anchors, then pulls the numeric id out of every
    /title/tt<id>/ href.

    Parameters
    ----------
    url : str
        URL of the chart page to scrape.

    Returns
    -------
    DataFrame
        Columns 'imdb_movie_id' (int) and 'ranking' (0-based position).
    """
    NUM_MOVIES = 100  # the chart lists exactly 100 titles

    soup = get_soup(url)
    votes = soup.find(text='Votes')
    # Hop forward to the first title anchor; the offsets are tied to
    # IMDB's exact markup.
    current_movie = votes.next.next.next.next.next.next.next.next.next.next.next

    movies = [current_movie]
    for i in range(NUM_MOVIES - 1):
        # Each subsequent title anchor sits a fixed number of nodes ahead.
        current_movie = current_movie.next.next.next.next.next.next.next.next.next.next.next.next.next.next.next.next
        movies.append(current_movie)

    imdb_movie_id = []
    ranking = []

    # Compile once (hoisted out of the loop); raw string avoids the
    # deprecated '\d' escape.  The unused 'ret = OrderedDict()' local
    # from the original has been dropped.
    title_re = re.compile(r'/title/tt(\d+)/')
    for i, movie in enumerate(movies):
        m = title_re.match(movie['href'])
        imdb_movie_id.append(int(m.groups()[0]))
        ranking.append(i)

    return DataFrame({'imdb_movie_id': imdb_movie_id, 'ranking': ranking})
# Example #9 (score: 0)
def nreviews_on_page(imdb_movie_id, debug=False):
    """ This static function is necessary for caching. """
    # Fetch and parse the movie's main page, then delegate the actual
    # review-count scraping to IMDBScraper.
    soup = get_soup(IMDBScraper.get_main_page_url(imdb_movie_id))
    return IMDBScraper._scrape_nreviews(soup, imdb_movie_id, debug)
# Example #10 (score: 0)
def nreviews_on_page(imdb_movie_id, debug=False):
    """ This static function is necessary for caching. """
    # Build the main-page URL, download/parse it, then hand the soup
    # off to the scraper class for the actual review count.
    page_url = IMDBScraper.get_main_page_url(imdb_movie_id)
    page_soup = get_soup(page_url)
    result = IMDBScraper._scrape_nreviews(page_soup, imdb_movie_id, debug)
    return result