def get_score_user_similarity_model(movie_data):
    """Evaluate the user-similarity model on a random train/test split.

    The ratings in ``movie_data.ratings_df`` are split with
    ``train_test_split``; both partitions are predicted with
    ``get_y_pred_user_similarity_model`` and scored with ``get_scores``.
    The mean and standard deviation of the collected train and test scores
    are printed.

    Args:
        movie_data: object exposing a ``ratings_df`` attribute. It is also
            handed unchanged to ``get_y_pred_user_similarity_model``.

    Returns:
        None. Results are only printed.
    """
    n_iter = 1  # number of independent random splits to average over
    all_ratings = movie_data.ratings_df
    train_scores, test_scores = [], []

    for _ in range(n_iter):
        train_part, test_part = train_test_split(all_ratings)
        x_train, y_train = get_xy(train_part)
        x_test, y_test = get_xy(test_part)

        pred_on_train = get_y_pred_user_similarity_model(movie_data, x_train)
        pred_on_test = get_y_pred_user_similarity_model(movie_data, x_test)

        # get_scores takes the test pair first but returns (train, test).
        split_scores = get_scores(y_test, pred_on_test, y_train, pred_on_train)
        score_on_train, score_on_test = split_scores
        train_scores.append(score_on_train)
        test_scores.append(score_on_test)

    train_summary = (np.mean(train_scores), np.std(train_scores))
    print('mean train score: %.4f, std: %.4f' % train_summary)
    test_summary = (np.mean(test_scores), np.std(test_scores))
    print('mean test score: %.4f, std: %.4f' % test_summary)
def build_model(ratings_df):
    """Build a factorized rating model and print its train/test scores.

    Steps, in order:
      1. split ``ratings_df`` with ``train_test_split``;
      2. build a ``MovieMatrixData`` from the training part and pass it to
         ``explore_rating_matrix``;
      3. factor its ``rating_matrix`` with ``factor`` and multiply the two
         returned matrices back together;
      4. clip the product to ``[min_rating, max_rating]`` of the matrix data;
      5. predict both partitions with ``get_y_pred`` and score them with
         ``get_scores``.

    Args:
        ratings_df: ratings table accepted by ``train_test_split``,
            ``get_xy`` and ``MovieMatrixData``.

    Returns:
        None. The product shape, its value range and the scores are printed.
    """
    train_ratings_df, test_ratings_df = train_test_split(ratings_df)
    x_train, y_train = get_xy(train_ratings_df)
    x_test, y_test = get_xy(test_ratings_df)

    # Only the training ratings feed the matrix, so the test part stays unseen.
    matrix_data = MovieMatrixData(train_ratings_df)
    explore_rating_matrix(matrix_data)

    p_matrix, q_matrix = factor(matrix_data.rating_matrix)
    reconstructed = p_matrix.dot(q_matrix)
    print(reconstructed.shape)
    # Value range before clipping, printed as "<min> <max>".
    print('%s %s' % (np.amin(reconstructed), np.amax(reconstructed)))

    # Keep every prediction inside the rating bounds of the matrix data.
    bounded = np.clip(
        reconstructed,
        a_min=matrix_data.min_rating,
        a_max=matrix_data.max_rating,
    )

    pred_on_train = get_y_pred(matrix_data, bounded, x_train)
    pred_on_test = get_y_pred(matrix_data, bounded, x_test)

    # get_scores takes the test pair first but returns (train, test).
    scores = get_scores(y_test, pred_on_test, y_train, pred_on_train)
    score_on_train, score_on_test = scores
    print('train: %.3f, test: %.3f' % (score_on_train, score_on_test))
def fit(self, ratings_df):
    """Fit every base model, then fit the blending regression on their output.

    Each model in ``self.models`` is fitted on ``ratings_df``. The blend
    predictions for the same ratings (``self.get_blend_predictions``) are then
    used as the inputs of ``self.regression``, with the ``y`` returned by
    ``get_xy`` as the target. The fitted coefficients and intercept are
    printed.

    Args:
        ratings_df: ratings table accepted by the base models' ``fit`` and by
            ``get_xy``.

    Returns:
        self, matching the other ``fit`` methods in this code base so that
        ``model = model.fit(df)`` keeps a usable model instead of ``None``.
    """
    with elapsed_time('total fit'):
        for model in self.models:
            model.fit(ratings_df)

        x, y = get_xy(ratings_df)
        with elapsed_time('get blend predictions'):
            blend_predictions = self.get_blend_predictions(x)

        # The per-row user/movie "support" counts that used to be built here
        # (an iterrows loop with a group lookup per row) were never read, so
        # that work has been dropped.
        self.regression.fit(blend_predictions, y)
        print('linear regression coefficients: %s, intercept: %.3f' % (
            self.regression.coef_, self.regression.intercept_))

    return self
def build_model(ratings_df, n_iter=1, model=None):
    """Fit and score a rating model over repeated random train/test splits.

    For each iteration the ratings are split with ``train_test_split``, the
    model is fitted on the training part, and R^2 (``r2_score``) and RMSE
    (``root_mean_squared_error``) are computed on both parts. The mean and
    standard deviation of every metric across iterations are printed.

    Args:
        ratings_df: ratings table accepted by ``train_test_split``,
            ``get_xy`` and the model's ``fit``.
        n_iter: number of random splits to evaluate. Defaults to 1, the value
            that used to be hard-coded.
        model: object with ``fit(ratings_df)`` and ``predict(x)``. Defaults to
            ``UserSimilarityModel(movie_lambda=5.0, user_lambda=20.0)``.
            Alternatives previously selected by editing this function:
            ``BaselineTotalMeanModel()``,
            ``BaselineMeansModel(user_weight=0.5)``,
            ``BaselineEffectsModel(movie_lambda=5.0, user_lambda=20.0)``.

    Returns:
        None. Results are only printed.

    Raises:
        ValueError: if ``n_iter`` is smaller than 1 (there would be no scores
            to average).
    """
    if n_iter < 1:
        raise ValueError('n_iter must be at least 1, got %r' % (n_iter,))
    if model is None:
        model = UserSimilarityModel(movie_lambda=5.0, user_lambda=20.0)

    train_scores, test_scores = [], []
    train_rmse_scores, test_rmse_scores = [], []

    for _ in range(n_iter):
        train_ratings_df, test_ratings_df = train_test_split(ratings_df)

        # Some fit() implementations return self and some return None; only
        # rebind when something came back, so the model is never lost.
        fitted = model.fit(train_ratings_df)
        if fitted is not None:
            model = fitted

        x_train, y_train = get_xy(train_ratings_df)
        x_test, y_test = get_xy(test_ratings_df)

        with elapsed_time('scoring'):
            y_train_pred = model.predict(x_train)
            y_test_pred = model.predict(x_test)

            train_score = r2_score(y_train, y_train_pred)
            test_score = r2_score(y_test, y_test_pred)
            train_rmse = root_mean_squared_error(y_train, y_train_pred)
            test_rmse = root_mean_squared_error(y_test, y_test_pred)

        train_scores.append(train_score)
        test_scores.append(test_score)
        train_rmse_scores.append(train_rmse)
        test_rmse_scores.append(test_rmse)

    print('mean train score: %.4f, std: %.4f' % (
        np.mean(train_scores), np.std(train_scores)))
    print('mean test score: %.4f, std: %.4f' % (
        np.mean(test_scores), np.std(test_scores)))
    print('')
    print('mean train rmse: %.4f, std: %.4f' % (
        np.mean(train_rmse_scores), np.std(train_rmse_scores)))
    print('mean test rmse: %.4f, std: %.4f' % (
        np.mean(test_rmse_scores), np.std(test_rmse_scores)))
def fit(self, ratings_df):
    """Store the mean target plus the movie and user effects of the ratings.

    Sets ``self.y_mean``, ``self.user_groups``, ``self.movie_effects`` and
    ``self.user_effects``.

    Args:
        ratings_df: ratings table with ``userId``, ``movieId`` and ``rating``
            columns (the columns grouped/selected below).

    Returns:
        self, so calls can be chained.
    """
    with elapsed_time('effects init'):
        _unused_x, targets = get_xy(ratings_df)
        self.y_mean = targets.mean()

        # Ratings bucketed per movie; the per-user grouping is kept on the
        # instance as well as being used for the user effects.
        ratings_by_movie = ratings_df.groupby('movieId')['rating']
        self.user_groups = ratings_df.groupby('userId')

        self.movie_effects = self.calculate_movie_effects(ratings_by_movie)
        self.user_effects = self.calculate_user_effects(self.user_groups)

    return self
def score_models(ratings_df, model_records):
    """Fit several models on one shared split and print their scores.

    A single ``train_test_split`` of ``ratings_df`` is reused for every model,
    so the printed numbers are directly comparable.

    Args:
        ratings_df: ratings table accepted by ``train_test_split`` and
            ``get_xy``.
        model_records: iterable of ``(model_name, model)`` pairs; each model
            must provide ``fit(ratings_df)`` and ``predict(x)``.

    Returns:
        None. For each model the name, train/test R^2 and train/test RMSE are
        printed.
    """
    train_ratings_df, test_ratings_df = train_test_split(ratings_df)
    x_train, y_train = get_xy(train_ratings_df)
    x_test, y_test = get_xy(test_ratings_df)

    print('')
    for label, candidate in model_records:
        candidate.fit(train_ratings_df)

        pred_on_train = candidate.predict(x_train)
        pred_on_test = candidate.predict(x_test)

        r2_pair = (
            r2_score(y_train, pred_on_train),
            r2_score(y_test, pred_on_test),
        )
        rmse_pair = (
            root_mean_squared_error(y_train, pred_on_train),
            root_mean_squared_error(y_test, pred_on_test),
        )

        print('%s' % label)
        print('train r2 score: %.4f, test r2 score: %.4f' % r2_pair)
        print('train rmse: %.4f, test rmse: %.4f\n' % rmse_pair)
def fit(self, ratings_df):
    """Remember the mean of the training targets as ``self.y_mean``.

    Args:
        ratings_df: ratings table accepted by ``get_xy``; only the second
            value it returns (the targets) is used.

    Returns:
        self, so calls can be chained.
    """
    _unused_x, targets = get_xy(ratings_df)
    self.y_mean = targets.mean()
    return self