示例#1
0
def test_get_movielist_movie_attributes():
    """Check the attributes extracted from single movie-list entries.

    Three fixtures are parsed in turn: a "normal" entry (expects a PSI for
    user 'tijl'), a rated entry (expects a rating for 'tijl') and an entry
    for which no 'my_ratings' key is expected at all.
    """
    scraper = CritickerScraper()
    keys_without_ratings = {'crit_id', 'crit_url', 'title', 'year', 'date_added'}
    keys_with_ratings = keys_without_ratings | {'my_ratings'}

    def parse_first_item(fixture_path):
        # Only the first <li> element of the fixture is handed to the scraper.
        list_item = BeautifulSoup(read_file(fixture_path), "lxml").find('li')
        return scraper.get_movielist_movie_attributes(list_item)

    normal = parse_first_item(
        'test/fixtures/criticker-normal-movie-in-movie-list.html')
    assert set(normal.keys()) == keys_with_ratings
    assert normal['my_ratings']['tijl'] == {'psi': 55}
    assert normal['crit_id'] == 26496
    assert normal['crit_url'] == 'https://www.criticker.com/film/Issiz-adam/'
    assert normal['title'] == 'Issiz adam'
    assert normal['year'] == 2008
    assert 'date_added' in normal
    # The timestamp is expected to be recent enough to humanize as 'just now'.
    assert arrow.get(normal['date_added']).humanize() == 'just now'

    rated = parse_first_item(
        'test/fixtures/criticker-rated-movie-in-movie-list.html')
    assert set(rated.keys()) == keys_with_ratings
    assert rated['my_ratings']['tijl'] == {'rating': 61}

    no_psi = parse_first_item(
        'test/fixtures/criticker-nopsi-movie-in-movie-list.html')
    assert set(no_psi.keys()) == keys_without_ratings
示例#2
0
def test_fibonacci():
    """Check CritickerScraper.fibonacci against a few known values."""
    scraper = CritickerScraper()
    # (index, expected Fibonacci number) pairs, checked in this order.
    known_values = ((0, 0), (1, 1), (2, 1), (3, 2), (5, 5))
    for index, expected in known_values:
        assert scraper.fibonacci(index) == expected
示例#3
0
def test_get_movie_info_no_votes():
    """A movie page fixture without votes should yield crit_votes == 0."""
    movie_url = 'https://www.criticker.com/film/16-Fathoms-Deep/'
    scraper = CritickerScraper()
    with requests_mock.mock() as http_mock:
        # Serve the stored fixture instead of hitting the real site.
        http_mock.get(
            movie_url,
            text=read_file('test/fixtures/criticker-16-fathoms-deep.html'))
        info = scraper.get_movie_info(movie_url)
    assert info.get('crit_votes') == 0
示例#4
0
def test_get_movie_info_no_poster():
    """A movie page fixture without a poster should yield no poster_url."""
    movie_url = 'https://www.criticker.com/film/8-Tire-on-the-Ice/'
    scraper = CritickerScraper()
    with requests_mock.mock() as http_mock:
        # Serve the stored fixture instead of hitting the real site.
        http_mock.get(
            movie_url,
            text=read_file('test/fixtures/criticker-8-tire-on-the-ice.html'))
        info = scraper.get_movie_info(movie_url)
    assert info.get('poster_url') is None
示例#5
0
def test_get_movie_info_no_rating_of_my_own():
    """The fixture for 'The Mask' should carry no own rating for user 'tijl'."""
    movie_url = 'http://www.criticker.com/film/The-Mask/'
    scraper = CritickerScraper()
    with requests_mock.mock() as http_mock:
        # Serve the stored fixture instead of hitting the real site.
        http_mock.get(
            movie_url,
            text=read_file('test/fixtures/criticker-the-mask.html'))
        info = scraper.get_movie_info(movie_url)
    own_ratings = info['my_ratings']['tijl']
    assert own_ratings.get('rating') is None
示例#6
0
def test_get_movie_info_no_trailer():
    """The fixture for 'Daens' should produce a trailer_url of None."""
    movie_url = 'http://www.criticker.com/film/Daens/'
    scraper = CritickerScraper()
    with requests_mock.mock() as http_mock:
        # Serve the stored fixture instead of hitting the real site.
        http_mock.get(
            movie_url,
            text=read_file('test/fixtures/criticker-daens.html'))
        info = scraper.get_movie_info(movie_url)
    # Indexing (not .get) on purpose: the key itself must be present.
    assert info['trailer_url'] is None
示例#7
0
def test_get_movie_list_popularity_page(mocker):
    """Check the movies and page count returned for one movie-list page.

    NOTE(review): the ``mocker`` fixture is requested but never used in this
    function, and no HTTP mocking is visible; the expected values presumably
    depend on patches that are not part of this snippet -- confirm.
    """
    scraper = CritickerScraper()
    page_url = 'https://www.criticker.com/films/?filter=n9zp9zf2000zor&p=1'
    page_movies, page_count = scraper.get_movie_list_page(
        page_url, pagenr=1, popularity=9)
    assert page_count == 6
    assert len(page_movies) == 5
    assert page_movies[0] == {'crit_id': 1}
示例#8
0
def test_get_movie_list_html():
    """Check the raw movie-list tags and page count parsed from a fixture."""
    list_url = 'https://www.criticker.com/films/?filter=or&view=all'
    scraper = CritickerScraper()
    with requests_mock.mock() as http_mock:
        # Serve the stored fixture instead of hitting the real site.
        http_mock.get(
            list_url,
            text=read_file('test/fixtures/criticker-movie-list.html'))
        entries, page_count = scraper.get_movie_list_html(list_url)
    assert page_count == 2283
    assert len(entries) == 60
    # The result is expected to be an unprocessed BeautifulSoup result set.
    assert isinstance(entries, ResultSet)
    assert isinstance(entries[0], Tag)
示例#9
0
def test_get_movies_of_popularity(mocker):
    """Check the aggregated result and the per-page calls for one popularity.

    NOTE(review): ``get_movie_list_popularity_page`` is inspected through
    ``call_count`` / ``call_args_list``, so it is presumably patched with a
    mock; no patching is visible in this snippet and ``mocker`` is unused
    here -- confirm.
    """
    scraper = CritickerScraper()
    collected = scraper.get_movies_of_popularity(popularity=8, min_year=2000)
    assert collected == [1, 2, 3, 1, 2, 3, 1, 2, 3]

    page_fetcher = scraper.get_movie_list_popularity_page
    assert page_fetcher.call_count == 4
    # Index [1] of a recorded call holds its keyword arguments.
    first_call_kwargs = page_fetcher.call_args_list[0][1]
    assert first_call_kwargs == {'min_year': 2000, 'popularity': 8}
    second_call_kwargs = page_fetcher.call_args_list[1][1]
    assert second_call_kwargs == {
        'min_year': 2000,
        'popularity': 8,
        'pagenr': 1,
    }
示例#10
0
def test_get_movies(mocker):
    """Check what get_movies saves and which popularity levels it requests.

    NOTE(review): ``db.save_movies`` and ``get_movies_of_popularity`` are
    inspected through ``call_args_list``, so they are presumably patched with
    mocks; no patching is visible in this snippet and ``mocker`` is unused
    here -- confirm.
    """
    create_test_tables()
    database = MySQLDatabase(schema='qmdb_test', env='test')
    scraper = CritickerScraper()
    scraper.get_movies(database, start_popularity=8)

    # Index [0] of a recorded call holds its positional arguments.
    first_save_positional = database.save_movies.call_args_list[0][0]
    assert first_save_positional[0] == [1, 1, 1]

    expected_kwargs_per_call = [
        {'debug': False, 'min_year': 2013, 'popularity': 10},
        {'debug': False, 'min_year': 2016, 'popularity': 9},
        {'debug': False, 'min_year': 2018, 'popularity': 8},
    ]
    recorded_calls = scraper.get_movies_of_popularity.call_args_list
    for call_index, expected_kwargs in enumerate(expected_kwargs_per_call):
        assert recorded_calls[call_index][1] == expected_kwargs
    remove_test_tables(database)
示例#11
0
def test_get_movie_info():
    """Check the fields parsed from the stored page for 'The Matrix'."""
    movie_url = 'http://www.criticker.com/film/The-Matrix/'
    scraper = CritickerScraper()
    with requests_mock.mock() as http_mock:
        # Serve the stored fixture instead of hitting the real site.
        http_mock.get(
            movie_url,
            text=read_file('test/fixtures/criticker-the-matrix.html'))
        info = scraper.get_movie_info(movie_url)

    expected_poster = 'https://www.criticker.com/img/films/posters/The-Matrix.jpg'
    assert info['poster_url'] == expected_poster
    assert info['imdbid'] == 133093
    # Rating, vote count and PSI are compared with a tolerance.
    assert info['crit_rating'] == pytest.approx(7.71, 0.3)
    assert info['crit_votes'] == pytest.approx(27493, 1000)
    own_ratings = info['my_ratings']['tijl']
    assert own_ratings['rating'] == 93
    assert own_ratings['psi'] == pytest.approx(80, 10)
    expected_trailer = 'https://www.youtube.com/watch?v=vKQi3bBA1y8'
    assert info['trailer_url'] == expected_trailer
示例#12
0
文件: updater.py 项目: tijlk/qmdb
 def __init__(self, **kwargs):
     """Set up source names, update-period multipliers, scrapers and limits.

     Args:
         **kwargs: Overrides for entries of ``self.multipliers``. They are
             merged in before the derived ``multiplier_<source>`` and
             ``multiplier_<source>_firsttime`` entries are computed.
     """
     self.sources = [
         'criticker', 'omdb', 'imdb_main', 'imdb_release',
         'imdb_metacritic', 'imdb_keywords', 'imdb_taglines',
         'imdb_vote_details', 'imdb_plot', 'ptp'
     ]
     settings = {
         'base_multiplier': 1,
         'base_multiplier_criticker': 1,
         'base_multiplier_omdb': 10,
         'base_multiplier_imdb_main': 1,
         'base_multiplier_imdb_release': 10,
         'base_multiplier_imdb_metacritic': 5,
         'base_multiplier_imdb_keywords': 10,
         'base_multiplier_imdb_taglines': 10,
         'base_multiplier_imdb_vote_details': 2,
         'base_multiplier_imdb_plot': 10,
         'base_multiplier_ptp': 1,
         'firsttime_speedup': 50000
     }
     settings.update(kwargs)
     self.multipliers = settings
     # Neither of these two entries is written inside the loop below, so
     # they can be read once up front.
     overall_factor = settings['base_multiplier']
     speedup = settings['firsttime_speedup']
     for name in self.sources:
         scaled = settings['base_multiplier_' + name] * overall_factor
         settings['multiplier_' + name] = scaled
         # First-time updates use a period divided by the speed-up factor.
         settings['multiplier_' + name + '_firsttime'] = scaled / speedup
     self.crit_scraper = CritickerScraper()
     self.omdb_scraper = OMDBScraper()
     self.imdb_scraper = IMDBScraper()
     self.ptp_scraper = PassThePopcornScraper()
     # Filled in later; start out unset.
     self.years = None
     self.crit_pop = None
     self.earliest_date_added = None
     self.max_connections_per_hour = {
         'criticker': 400,
         'omdb': 40,
         'imdb': 800,
         'ptp': 400
     }
示例#13
0
文件: updater.py 项目: tijlk/qmdb
class Updater(object):
    """Work out which movie/source pairs are due for a refresh and refresh them.

    For every movie an update period is derived from its release year and its
    Criticker popularity, scaled by a per-source multiplier and randomised
    with a Weibull draw. Updates that are due are then spread over time per
    source group according to ``max_connections_per_hour``.
    """

    def __init__(self, **kwargs):
        """Set up source names, update-period multipliers, scrapers and limits.

        Args:
            **kwargs: Overrides for entries of ``self.multipliers`` (e.g.
                ``base_multiplier`` or ``firsttime_speedup``). They are merged
                in before the derived ``multiplier_<source>`` and
                ``multiplier_<source>_firsttime`` entries are computed.
        """
        self.sources = [
            'criticker', 'omdb', 'imdb_main', 'imdb_release',
            'imdb_metacritic', 'imdb_keywords', 'imdb_taglines',
            'imdb_vote_details', 'imdb_plot', 'ptp'
        ]
        self.multipliers = {
            'base_multiplier': 1,
            'base_multiplier_criticker': 1,
            'base_multiplier_omdb': 10,
            'base_multiplier_imdb_main': 1,
            'base_multiplier_imdb_release': 10,
            'base_multiplier_imdb_metacritic': 5,
            'base_multiplier_imdb_keywords': 10,
            'base_multiplier_imdb_taglines': 10,
            'base_multiplier_imdb_vote_details': 2,
            'base_multiplier_imdb_plot': 10,
            'base_multiplier_ptp': 1,
            'firsttime_speedup': 50000
        }
        self.multipliers.update(kwargs)
        for source in self.sources:
            multiplier = (self.multipliers['base_multiplier_' + source] *
                          self.multipliers['base_multiplier'])
            self.multipliers['multiplier_' + source] = multiplier
            # Sources that were never fetched use a much shorter period.
            self.multipliers['multiplier_' + source + '_firsttime'] = \
                multiplier / self.multipliers['firsttime_speedup']
        self.crit_scraper = CritickerScraper()
        self.omdb_scraper = OMDBScraper()
        self.imdb_scraper = IMDBScraper()
        self.ptp_scraper = PassThePopcornScraper()
        # Populated by get_movies_stats().
        self.years = None
        self.crit_pop = None
        self.earliest_date_added = None
        self.max_connections_per_hour = {
            'criticker': 400,
            'omdb': 40,
            'imdb': 800,
            'ptp': 400
        }

    def update_movies(self, db, n=None, weibull_lambda=1.5):
        """Refresh the sources that are currently due, sleeping in between.

        Args:
            db: Database object exposing a ``movies`` mapping (crit_id ->
                movie) and a ``set_movie`` method.
            n: Maximum number of updates to perform in this call; ``None``
                performs all of them.
            weibull_lambda: Shape parameter of the Weibull draw used to
                randomise update periods.
        """
        self.get_movies_stats(db)
        updates = self.get_all_next_updates(db, weibull_lambda=weibull_lambda)
        crit_updates = self.get_source_update_sequence(updates, 'criticker')
        omdb_updates = self.get_source_update_sequence(updates, 'omdb')
        imdb_updates = self.get_source_update_sequence(updates, 'imdb')
        ptp_updates = self.get_source_update_sequence(updates, 'ptp')
        sorted_seq = sorted(crit_updates + omdb_updates + imdb_updates +
                            ptp_updates,
                            key=itemgetter('next_update'))
        # Slicing with n=None keeps the whole sequence.
        for source_to_update in sorted_seq[:n]:
            # Always wait at least one second between two updates.
            time_to_sleep = max(1, (source_to_update['next_update'] -
                                    arrow.now()).total_seconds())
            movie = db.movies[source_to_update['crit_id']]
            last_updated = getattr(movie,
                                   source_to_update['source'] + '_updated')
            crit_popularity = movie.crit_popularity
            if crit_popularity is None:
                # Fallback used for display only.
                crit_popularity = 5
            print(
                "{}: Updating {} info for '{}' ({}, popularity {:.1f}) {}. Last updated {}."
                .format(arrow.now().format('HH:mm:ss'),
                        source_to_update['source'],
                        movie.title,
                        movie.year,
                        crit_popularity,
                        arrow.now().shift(seconds=time_to_sleep).humanize(),
                        humanized_time(last_updated)))
            time.sleep(time_to_sleep)
            self.update_source(db, source_to_update)

    def get_movies_stats(self, db):
        """Compute min/median/max statistics over all movies in ``db``.

        Stores the release-year statistics in ``self.years``, the Criticker
        popularity statistics (movies without a popularity are ignored) in
        ``self.crit_pop``, each together with the fitted ``a_parameter`` and
        ``b_parameter``, and the earliest ``date_added`` in
        ``self.earliest_date_added``.

        Note: the reductions run on plain lists, so an empty ``db.movies``
        (or one without any popularity value) makes numpy raise.
        """
        years_numbers = [
            db.movies[crit_id].get_floating_release_year()
            for crit_id in db.movies
        ]
        years = {
            'min': np.min(years_numbers),
            'median': np.median(years_numbers),
            'max': np.max(years_numbers)
        }
        # The feature fed into the period curve is the distance to the
        # maximum, so the newest / most popular movies get feature 0.
        years['b_parameter'] = self.b_parameter(years['max'] - years['median'],
                                                years['max'] - years['min'])
        years['a_parameter'] = self.a_parameter(years['max'] - years['median'],
                                                years['b_parameter'])
        crit_pop_nrs = [
            db.movies[crit_id].crit_popularity for crit_id in db.movies
            if db.movies[crit_id].crit_popularity is not None
        ]
        crit_pop = {
            'min': np.min(crit_pop_nrs),
            'median': np.median(crit_pop_nrs),
            'max': np.max(crit_pop_nrs)
        }
        crit_pop['b_parameter'] = self.b_parameter(
            crit_pop['max'] - crit_pop['median'],
            crit_pop['max'] - crit_pop['min'])
        crit_pop['a_parameter'] = self.a_parameter(
            crit_pop['max'] - crit_pop['median'], crit_pop['b_parameter'])
        self.years = years
        self.crit_pop = crit_pop
        self.earliest_date_added = np.min(
            [db.movies[crit_id].date_added for crit_id in db.movies])

    @staticmethod
    def b_parameter(median_feature,
                    max_feature,
                    median_period=6,
                    max_period=36):
        """Return the exponent ``b`` of the curve ``period = exp(a * x**b)``.

        ``b`` is chosen such that the ratio of log-periods at
        ``max_feature`` and ``median_feature`` equals
        ``log(max_period) / log(median_period)``.
        """
        return np.log(np.log(max_period) / np.log(median_period)) / np.log(
            max_feature / median_feature)

    @staticmethod
    def a_parameter(median_feature, b_parameter, median_period=8):
        """Return the scale ``a`` of the curve ``period = exp(a * x**b)``.

        ``a`` is chosen such that the period at ``median_feature`` equals
        ``median_period``.

        NOTE(review): the default ``median_period`` here is 8 while
        ``b_parameter`` defaults to 6 -- confirm whether that is intended.
        """
        return np.log(median_period) / np.power(median_feature, b_parameter)

    def get_all_next_updates(self, db, weibull_lambda=1.5):
        """Return the update entries of all movies that are due right now.

        An entry is due when its ``next_update`` is not later than
        ``arrow.now()``.
        """
        seq = []
        for movie in db.movies.values():
            seq.extend(self.calculate_next_updates(
                movie, weibull_lambda=weibull_lambda))
        now = arrow.now()
        return [u for u in seq if u['next_update'] <= now]

    def calculate_next_updates(self, movie, weibull_lambda=1.5):
        """Return one update entry per applicable source for ``movie``.

        Requires ``get_movies_stats`` to have populated ``self.years``,
        ``self.crit_pop`` and ``self.earliest_date_added``.

        Each entry is a dict with the keys ``source``, ``crit_id``,
        ``update_period``, ``update_period_firsttime``, ``next_update`` and
        ``actual_update_period``. Sources that need an IMDb id (omdb, ptp and
        every imdb_* source) are left out when ``movie.imdbid`` is None.
        """
        year_period_score = self.calculate_period_score(
            self.years['max'] - movie.year, self.years)
        # Movies without a popularity are treated as having the median one.
        crit_popularity = self.crit_pop[
            'median'] if movie.crit_popularity is None else movie.crit_popularity
        crit_pop_period_score = self.calculate_period_score(
            self.crit_pop['max'] - crit_popularity, self.crit_pop)
        base_update_period = self.calculate_update_period(
            year_period_score, crit_pop_period_score)

        update_periods = {}
        for source in self.sources:
            needs_imdbid = (source in ('omdb', 'ptp')
                            or source.startswith('imdb'))
            if needs_imdbid and movie.imdbid is None:
                # Skip only this source. A `break` here would also drop any
                # later source that does not need an IMDb id, which made the
                # result depend on the order of self.sources.
                continue
            update_period = base_update_period * self.multipliers[
                'multiplier_' + source]
            update_period_firsttime = base_update_period * self.multipliers[
                'multiplier_' + source + '_firsttime']
            next_update, period = self.calculate_next_update(
                getattr(movie, source + '_updated'),
                update_period,
                update_period_firsttime,
                weibull_lambda=weibull_lambda)
            update_periods[source] = {
                'source': source,
                'crit_id': movie.crit_id,
                'update_period': update_period,
                'update_period_firsttime': update_period_firsttime,
                'next_update': next_update,
                'actual_update_period': period,
            }
        return list(update_periods.values())

    @staticmethod
    def calculate_period_score(feature, stats):
        """Return ``exp(a * feature**b)`` using the parameters in ``stats``."""
        return np.exp(stats['a_parameter'] *
                      np.power(feature, stats['b_parameter']))

    @staticmethod
    def calculate_update_period(year_period_score,
                                crit_pop_period_score,
                                year_power=2,
                                crit_pop_power=1):
        """Return the weighted geometric mean of the two period scores.

        ``year_power`` and ``crit_pop_power`` are the respective weights.
        """
        period = np.exp((year_power * np.log(year_period_score) +
                         crit_pop_power * np.log(crit_pop_period_score)) /
                        (year_power + crit_pop_power))
        return period

    def calculate_next_update(self,
                              date_updated,
                              period,
                              firsttime_period,
                              weibull_lambda=1.5,
                              min_period=500):
        """Return ``(next_update, period_used)`` for a single source.

        Args:
            date_updated: Time of the last update, or None if the source was
                never fetched; in that case ``self.earliest_date_added`` and
                ``firsttime_period`` are used instead.
            period: Regular update period, in weeks.
            firsttime_period: Update period for a never-fetched source, in
                weeks.
            weibull_lambda: Shape parameter of the Weibull draw.
            min_period: Despite its name this is an upper bound (in weeks)
                applied to both the randomised and the returned period.
        """
        # Dividing by ln(2)**(1/shape) rescales the draw to a median of 1.
        weibull = (np.random.weibull(weibull_lambda, 1) /
                   np.power(np.log(2), 1 / weibull_lambda))[0]
        if date_updated is None:
            next_update = self.earliest_date_added.shift(
                weeks=min(firsttime_period * weibull, min_period))
            return next_update, min(firsttime_period, min_period)
        next_update = date_updated.shift(
            weeks=min(period * weibull, min_period))
        return next_update, min(period, min_period)

    def get_source_update_sequence(self, all_updates, source):
        """Schedule the due updates of one source group under its rate limit.

        Selects the entries whose ``source`` starts with ``source``, orders
        them by ``next_update`` and then overwrites ``next_update`` in place:
        the first entry is scheduled relative to now, every further entry
        relative to its predecessor, with exponentially distributed gaps
        whose mean is ``3600 / updates_per_hour`` seconds.

        Returns:
            The selected entries in scheduled order (possibly empty).
        """
        updates = [u for u in all_updates if u['source'].startswith(source)]
        sorted_updates = sorted(updates, key=itemgetter('next_update'))
        if sorted_updates:
            uph = min(len(sorted_updates),
                      self.max_connections_per_hour[source])
            print("Updates needed per hour for {}: {:.0f}".format(source, uph))
            update_intervals = np.random.exponential(3600 / uph,
                                                     len(sorted_updates))
            scheduled = arrow.now()
            for update, interval in zip(sorted_updates, update_intervals):
                scheduled = scheduled.shift(seconds=interval)
                update['next_update'] = scheduled
        return sorted_updates

    def update_source(self, db, source_to_update):
        """Refresh one source of one movie and store the result in ``db``.

        Args:
            db: Database object exposing ``movies`` and ``set_movie``.
            source_to_update: Update entry with at least the keys
                ``crit_id`` and ``source``.
        """
        movie = db.movies[source_to_update['crit_id']]
        source = source_to_update['source']
        if source == 'criticker':
            movie = self.crit_scraper.refresh_movie(movie)
        elif source == 'omdb':
            movie = self.omdb_scraper.refresh_movie(movie)
        elif source.startswith('imdb'):
            # 'imdb_<infoset>' -> '<infoset>'
            infoset = re.search(r'imdb_(.*)', source).groups()[0]
            movie = self.imdb_scraper.refresh_movie(movie, infoset=infoset)
        elif source == 'ptp':
            movie = self.ptp_scraper.refresh_movie(movie)
        # The scrapers' result is only stored when it is not None.
        if movie is not None:
            db.set_movie(movie)

    def update_movie_completely(self, db, imdbid):
        """Refresh the movie with the given IMDb id from all scrapers in turn.

        Only the 'main' IMDb infoset is refreshed.

        Raises:
            IndexError: If no movie in ``db`` has this ``imdbid``.
        """
        movie = next((candidate for candidate in db.movies.values()
                      if candidate.imdbid == imdbid), None)
        if movie is None:
            raise IndexError(
                "No movie with imdbid {} found in the database".format(imdbid))
        # NOTE(review): each scraper's result is passed straight to the next
        # one; whether refresh_movie can return None here needs confirming.
        movie = self.crit_scraper.refresh_movie(movie)
        movie = self.omdb_scraper.refresh_movie(movie)
        movie = self.imdb_scraper.refresh_movie(movie, infoset='main')
        movie = self.ptp_scraper.refresh_movie(movie)
        if movie is not None:
            db.set_movie(movie)
示例#14
0
def test_get_year_from_movielist_title():
    """The year in parentheses at the end of a list title is returned as int."""
    scraper = CritickerScraper()
    parsed_year = scraper.get_year_from_movielist_title('The Matrix (1999)')
    assert parsed_year == 1999
示例#15
0
def test_config_cookies():
    """A fresh scraper exposes its cookies as a dict containing 'uid2'."""
    cookies = CritickerScraper().cookies
    assert isinstance(cookies, dict)
    assert 'uid2' in cookies
示例#16
0
文件: main.py 项目: tijlk/qmdb
from qmdb.database.database import MySQLDatabase
from qmdb.interfaces.omdb import OMDBScraper
from qmdb.interfaces.criticker import CritickerScraper
from qmdb.interfaces.updater import Updater
from qmdb.model.predictions import RatingModeler
from qmdb.interfaces.netflix import NetflixScraper
import time

if __name__ == "__main__":
    # Endless service loop: spend twelve hours refreshing already known
    # movies in small batches, then pull in new movies, Netflix data and
    # ratings and recompute the predictions, and start over.
    refresh_window_seconds = 12 * 3600

    db = MySQLDatabase(from_scratch=False)
    omdb_scraper = OMDBScraper()
    crit_scraper = CritickerScraper(user='******')
    updater = Updater()
    modeler = RatingModeler(db)
    netflix_scraper = NetflixScraper(db)

    while True:
        print("\nRefreshing movie information from Criticker, IMDb and OMDB\n")
        window_start = time.time()
        while time.time() - window_start <= refresh_window_seconds:
            updater.update_movies(db, n=30, weibull_lambda=3)
        crit_scraper.get_movies(db, start_popularity=2)
        netflix_scraper.get_genre_ids()
        netflix_scraper.get_movies_for_genres()
        crit_scraper.get_ratings(db)
        modeler.get_predictions()