예제 #1
0
def proc_Douban(df_Douban_raw, df_MovieLens, source=''):
    """Enrich the raw Douban ranking with TMDb details and save it as CSV.

    Each row is searched on TMDb by its title (column 1). The search result
    whose release year equals the row's year (column 2) is preferred; when no
    result matches the year, the first search result is used instead. The
    row's title is replaced by the matched TMDb title and the values returned
    by ``get_movie_details`` are appended to the row.

    Args:
        df_Douban_raw: DataFrame with four columns which, judging by the
            output column names, are rank, name, year and Douban rating.
        df_MovieLens: Passed through unchanged to ``get_movie_details``.
        source: ``'test'`` writes ``./data/Douban_top_250_test.csv``; any
            other value writes ``./data/Douban_top_250.csv``.

    Returns:
        pandas.DataFrame: The enriched table that was written to disk.
    """
    lists_Douban = df_Douban_raw.values.tolist()

    # One client is enough for the whole run; it does not depend on the row.
    movie_api = Movie()
    movie_api.wait_on_rate_limit = True

    for n, a_movie_list in enumerate(lists_Douban, start=1):
        # Retry until the search succeeds. Only ``Exception`` is caught so
        # that Ctrl-C / SystemExit can still break out of the retry loop.
        while True:
            try:
                search = movie_api.search(a_movie_list[1])
                break
            except Exception as error:
                print(error)
                time.sleep(0.25)
        print(n)  # progress indicator: 1-based index of the current row

        # ``release_date`` is split on '-' and its first piece is a string, so
        # compare against the string form of the year cell (an int cell would
        # otherwise never match).
        year = str(a_movie_list[2])

        tmdbId = 0
        # Default to the original title so that a search without results
        # neither raises NameError nor reuses the previous movie's title.
        name = a_movie_list[1]
        for res in search:
            if ('release_date' in res.__dict__
                    and res.release_date.split('-')[0] == year):
                tmdbId = res.id
                name = res.title
                break
        if tmdbId == 0 and len(search):
            # No result matched the year: fall back to the first result.
            tmdbId = search[0].id
            name = search[0].title

        a_movie_list[1] = name
        # Extending in place updates the row stored in ``lists_Douban``.
        a_movie_list += get_movie_details(tmdbId, df_MovieLens)

    df_Douban = pd.DataFrame(lists_Douban,
                             columns=[
                                 'rank', 'name', 'year', 'Douban_rating',
                                 'country', 'genres', 'budget', 'revenue',
                                 'keywords', 'MLens_rating', 'tmdb_id'
                             ])
    if source == 'test':
        df_Douban.to_csv('./data/Douban_top_250_test.csv')
        print('Douban testing dataset processed')
    else:
        df_Douban.to_csv('./data/Douban_top_250.csv')
        print('Complete Douban dataset processed')
    return df_Douban
예제 #2
0
def proc_Yahoo(df_Yahoo_raw, df_MovieLens, source=''):
    """Enrich the raw Yahoo ranking with TMDb details and save it as CSV.

    Each row is searched on TMDb by its title (column 1). The search result
    whose release year equals the row's year (column 2, with any parentheses
    removed) is preferred; when no result matches the year, the first search
    result is used instead. The values returned by ``get_movie_details`` are
    appended to the row. Finally the ``year`` column is stripped of
    parentheses and converted to ``int``.

    Args:
        df_Yahoo_raw: DataFrame with four columns which, judging by the
            output column names, are rank, name, year and Yahoo rating. The
            year cells are presumably strings such as ``'(1994)'``, since
            parentheses are stripped from them before the int conversion.
        df_MovieLens: Passed through unchanged to ``get_movie_details``.
        source: ``'test'`` writes ``./data/Yahoo_top_500_test.csv``; any
            other value writes ``./data/Yahoo_top_500.csv``.

    Returns:
        pandas.DataFrame: The enriched table that was written to disk.
    """
    lists_Yahoo = df_Yahoo_raw.values.tolist()

    # One client is enough for the whole run; it does not depend on the row.
    movie_api = Movie()
    movie_api.wait_on_rate_limit = True

    for a_movie_list in lists_Yahoo:
        # Retry until the search succeeds. Only ``Exception`` is caught so
        # that Ctrl-C / SystemExit can still break out of the retry loop.
        while True:
            try:
                search = movie_api.search(a_movie_list[1])
                break
            except Exception as error:
                print(error)
                time.sleep(0.25)

        # Normalise the year before comparing: with the parentheses left in
        # place the release-year match below could never succeed.
        year = re.sub('[()]', '', str(a_movie_list[2])).strip()

        tmdbId = 0
        for res in search:
            if ('release_date' in res.__dict__
                    and res.release_date.split('-')[0] == year):
                tmdbId = res.id
                break
        if tmdbId == 0 and len(search):
            # No result matched the year: fall back to the first result.
            tmdbId = search[0].id

        # Extending in place updates the row stored in ``lists_Yahoo``.
        a_movie_list += get_movie_details(tmdbId, df_MovieLens)

    df_Yahoo = pd.DataFrame(lists_Yahoo,
                            columns=[
                                'rank', 'name', 'year', 'Yahoo_rating',
                                'country', 'genres', 'budget', 'revenue',
                                'keywords', 'MLens_rating', 'tmdb_id'
                            ])
    # Strip the parentheses from every row in one vectorised step. This works
    # for any number of rows (the previous fixed count of 510 failed on
    # shorter inputs) and avoids chained item assignment on the column.
    df_Yahoo['year'] = (df_Yahoo['year'].astype(str)
                        .str.replace('[()]', '', regex=True)
                        .astype(int))
    if source == 'test':
        df_Yahoo.to_csv('./data/Yahoo_top_500_test.csv')
        print('Yahoo testing dataset processed')
    else:
        df_Yahoo.to_csv('./data/Yahoo_top_500.csv')
        print('Complete Yahoo dataset processed')
    return df_Yahoo
예제 #3
0
def get_movie_details(tmdbId, df_MovieLens):
    """Collect TMDb metadata and the MovieLens rating for one movie.

    Args:
        tmdbId: TMDb movie id. A falsy value (``0`` or ``None``) means no
            movie was matched; no API call is made and only placeholder
            values are returned.
        df_MovieLens: DataFrame with ``tmdbId`` and ``rating`` columns, used
            to look up the MovieLens rating. Not touched when ``tmdbId`` is
            falsy.

    Returns:
        list: ``[country, genres, budget, revenue, keywords, MLens_rating,
        tmdbId]``. ``country`` is the ``iso_3166_1`` code of the first
        production country (``''`` if there is none); ``genres`` and
        ``keywords`` are '/'-joined names; ``MLens_rating`` is NaN unless
        exactly one MovieLens row has this ``tmdbId``. Placeholders are
        ``''`` for the text fields and NaN for the numeric ones.
    """
    country, genres, keywords = '', '', ''
    # ``np.nan`` instead of ``np.NaN``: the capitalised alias no longer
    # exists in NumPy 2.0.
    budget, revenue, MLens_rating = np.nan, np.nan, np.nan

    if tmdbId:
        movie_api = Movie()
        movie_api.wait_on_rate_limit = True
        # The response payload is read from the ``entries`` attribute of the
        # object returned by ``details``.
        entries = movie_api.details(tmdbId).__dict__['entries']

        country_list = entries['production_countries']
        if country_list:
            country = country_list[0]['iso_3166_1']
        genres = '/'.join([i['name'] for i in entries['genres']])
        budget = entries['budget']
        revenue = entries['revenue']
        keywords = '/'.join(
            [i['name'] for i in entries['keywords']['keywords']])

        MLens_rating_Series = df_MovieLens[df_MovieLens['tmdbId'] ==
                                           tmdbId]['rating']
        # Only an unambiguous (single-row) match is used as the rating.
        if len(MLens_rating_Series) == 1:
            # Take the element explicitly; calling float() on a whole Series
            # is deprecated in pandas.
            MLens_rating = float(MLens_rating_Series.iloc[0])

    return [country, genres, budget, revenue, keywords, MLens_rating, tmdbId]