Example #1
def setup_test_data(min_reviews=2, local=False):
    """ Creating a dataframe per user with the inputs and target for testing """

    # Fetching the review dataframe
    print("Fetching the review dataframe...")

    if local:
        input_csv_path = os.path.join(os.path.dirname(__file__), "data/preprocessed")
        data = pd.read_csv(f'{input_csv_path}/review_pp.csv')
    else:
        data = storage.import_file('data/preprocessed', 'review_pp.csv')

    # Creating user / reviews dict
    print("Creating user and reviews dict...")
    user_reviews = data.groupby('user_id') \
                       .agg({'recipe_id': list,
                             'rating': list,
                             'liked': list}) \
                       .reset_index()

    # Selecting only users with at least min_reviews (2 by default)
    print(f"Selecting only users with at least {min_reviews} reviews...")
    selected_user_reviews = user_reviews[user_reviews['recipe_id'].str.len() >= min_reviews]

    print("Create and filling evaluation dataframe...")

    # Collect the evaluation rows, then build the dataframe in one pass
    # (DataFrame.append was removed in pandas 2.0)
    rows = []
    for index, row in selected_user_reviews.iterrows():
        user = row['user_id']
        target = row['recipe_id'][-1]
        liked = row['liked'][-1]
        rating = row['rating'][-1]
        inputs = {row['recipe_id'][i]: row['liked'][i] for i in range(len(row['recipe_id']) - 1)}

        rows.append({'user_id': user, 'inputs': inputs, 'target': target, 'rating': rating, 'liked': liked})

    test_df = pd.DataFrame(rows, columns=['user_id', 'inputs', 'target', 'rating', 'liked'])

    print("Saving test input dataframe...")

    # timestamp = '{:%Y%m%d_%H%M}'.format(datetime.datetime.now())

    if local:
        output_csv_path = os.path.join(os.path.dirname(__file__), "data/test")
        test_df.to_csv(f'{output_csv_path}/test_inputs.csv', index=False)
    else:
        storage.upload_file(test_df, 'data/test', 'test_inputs.csv')

    return test_df
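
A minimal usage sketch, assuming the package's data layout and storage helper are configured (the min_reviews value here is illustrative):

# Hypothetical usage: build the per-user test set from local CSVs
test_df = setup_test_data(min_reviews=3, local=True)
print(test_df[['user_id', 'target']].head())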
Example #2
def run_recommendations(user_id=None, collaborative=0.5, clear_neg=False,
                        vectorizer='count', dimred='svd', ngram=(1, 1),
                        min_df=1, max_df=1.0, local=False):

    if local:
        csv_path = os.path.join(os.path.dirname(__file__), "data")
        test_df = pd.read_csv(f'{csv_path}/test/test_inputs.csv')
        recipe_df = pd.read_csv(f"{csv_path}/preprocessed/recipe_pp.csv")
    else:
        test_df = storage.import_file('data/test', 'test_inputs.csv')
        recipe_df = storage.import_file('data/preprocessed', 'recipe_pp.csv')

    if user_id:
        test_case = test_df[test_df.user_id == user_id]
    else:
        test_case = test_df.sample()

    # The inputs column holds a dict serialized to CSV; ast.literal_eval
    # parses it back safely, unlike eval (requires import ast at module level)
    inputs = ast.literal_eval(test_case.inputs.values[0])

    # Build the dataframe directly from the dict items
    # (DataFrame.append was removed in pandas 2.0)
    input_df = pd.DataFrame(list(inputs.items()), columns=['recipe_id', 'liked'])

    input_df = input_df.merge(recipe_df, on='recipe_id', how='left')\
               [['recipe_id', 'name', 'liked']]

    display(input_df)

    # Forward the function's own vectorizer settings instead of hardcoding the defaults
    recommendations = get_user_recommendations(user_inputs=inputs, collaborative=collaborative, clear_neg=clear_neg,
                                               vectorizer=vectorizer, dimred=dimred, ngram=ngram,
                                               min_df=min_df, max_df=max_df)

    output_df = recommendations.merge(recipe_df, on='recipe_id', how='left') \
                [['recipe_id', 'name', 'content', 'collaborative', 'hybrid', 'rec_score']]

    display(output_df.head(10))

    return input_df, output_df
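
A minimal usage sketch (the user id is illustrative):

# Hypothetical usage: inspect the recommendations for one test user,
# weighting content and collaborative scores equally
input_df, output_df = run_recommendations(user_id=12345, collaborative=0.5, local=True)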
Example #3
def run_test(predict=True, sample=None, vectorizer='count', dimred='svd', ngram=(1, 1), min_df=1, max_df=1.0, local=False):
    """ Running the test by computing predictions and preparing the results dataframe """
    pd.options.mode.chained_assignment = None

    print("Fetching the test inputs...")

    if local:
        input_csv_path = os.path.join(os.path.dirname(__file__), "data/test")
        input_data = pd.read_csv(f'{input_csv_path}/test_inputs.csv')
    else:
        input_data = storage.import_file('data/test', 'test_inputs.csv')

    print("Calculating predictions...")

    if sample is None:
        data = input_data.copy()
    else:
        data = input_data.sample(sample, random_state=42)

    if predict:
        predictions = list()

        for index, test in data.iterrows():
            # Parse the serialized inputs dict safely and forward the
            # function's own vectorizer settings instead of hardcoded defaults
            prediction_matrix = get_user_recommendations(user_inputs=ast.literal_eval(test['inputs']), clear_neg=False,
                                                         user_id=test['user_id'], forced_recipes=[test['target']],
                                                         vectorizer=vectorizer, dimred=dimred, ngram=ngram,
                                                         min_df=min_df, max_df=max_df)

            prediction_row = prediction_matrix[prediction_matrix.index == test['target']]

            if len(prediction_row) > 0:
                pred = float(prediction_row['rec_score'].iloc[0])
                predictions.append(np.round(pred, 3))
                print(f"> ({index}) Prediction for user {test['user_id']} done: {pred}!")

            else:
                predictions.append(None)
                print(f"> ({index}) Prediction for user {test['user_id']} not found!")

    else:
        ln = lognorm.rvs(0.2, size=data.shape[0])
        predictions = (ln - ln.min()) / (ln.max() - ln.min())

    print("Preparing results dataframe...")
    data['rec_score'] = predictions

    print("Cleaning up failed scores...")
    data = data[data['rec_score'] >= 0]

    data['rec_rating'] = __convert_to_rating(data[['rating', 'rec_score']])
    data['rec_liked'] = 0
    data['rec_classify'] = ''

    print("Iterating and filling results dataframe...")

    for index, row in data.iterrows():
        if data.loc[index, 'rec_rating'] >= 4:
            data.loc[index, 'rec_liked'] = 1

        actual = data.loc[index, 'liked'] == 1
        predicted = data.loc[index, 'rec_liked'] == 1  # renamed to avoid shadowing the predict argument

        if predicted and actual:
            data.loc[index, 'rec_classify'] = 'TP'
        elif predicted and not actual:
            data.loc[index, 'rec_classify'] = 'FP'
        elif not predicted and not actual:
            data.loc[index, 'rec_classify'] = 'TN'
        else:
            data.loc[index, 'rec_classify'] = 'FN'

    print("Saving results dataframe...")

    timestamp = '{:%Y%m%d_%H%M}'.format(datetime.datetime.now())

    if local:
        csv_path = os.path.join(os.path.dirname(__file__), "data/test")
        data.to_csv(f'{csv_path}/test_outputs_{timestamp}.csv', index=False)
    else:
        storage.upload_file(data, 'data/test', f'test_outputs_{timestamp}.csv')

    print("Calculating metrics for tests...")

    metrics = get_scoring_metrics(data)

    # Returns tuple of data and metrics
    return data, metrics
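
Since get_scoring_metrics is project-internal, here is a rough sketch of how precision and recall could be derived from the rec_classify labels the function produces (an assumption, not the project's actual implementation):

# Hypothetical sketch: confusion counts to precision/recall
results, _ = run_test(sample=50, local=True)
counts = results['rec_classify'].value_counts()
tp, fp, fn = counts.get('TP', 0), counts.get('FP', 0), counts.get('FN', 0)
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
print(f"precision={precision:.3f}  recall={recall:.3f}")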
Example #4
def __create_latent_matrices(pool=2000, content_reduction=250, rating_reduction=800,
                             user_inputs=None, user_id=None, forced_recipes=None,
                             goal='', diet='', allergies=None, dislikes=None,
                             custom_dsl='', time=None, steps=None,
                             vectorizer='count', dimred='svd',
                             ngram=(1, 1), min_df=1, max_df=1.0, local=False):

    ''' Generates the latent dataframes used for the prediction model '''

    #### First the data needs to be loaded
    # Guard against the mutable-default-argument pitfall
    forced_recipes = forced_recipes or []
    allergies = allergies or []
    dislikes = dislikes or []

    if user_inputs is None:
        user_inputs = TEST_USER
    user_recipes = list(user_inputs.keys())

    if local:
        csv_path = os.path.join(os.path.dirname(__file__), "data/preprocessed")
        recipes_df_raw = pd.read_csv(f"{csv_path}/recipe_pp.csv")
        reviews_df_raw = pd.read_csv(f"{csv_path}/review_pp.csv")
    else:
        recipes_df_raw = storage.import_file('data/preprocessed', 'recipe_pp.csv')
        reviews_df_raw = storage.import_file('data/preprocessed', 'review_pp.csv')

    # For test purposes only
    if forced_recipes and user_id:
        for fr in forced_recipes:
            reviews_df_raw = reviews_df_raw[~((reviews_df_raw['recipe_id'] == fr) & (reviews_df_raw['user_id'] == user_id))]

    user_recipe_df = recipes_df_raw[recipes_df_raw.recipe_id.isin(user_recipes)]
    other_recipes_df = recipes_df_raw[~recipes_df_raw.recipe_id.isin(user_recipes + forced_recipes)]
    forced_recipes_df = recipes_df_raw[recipes_df_raw.recipe_id.isin(forced_recipes)]

    sample = np.min([pool, (len(other_recipes_df) + len(forced_recipes_df))])
    target_df = pd.concat([other_recipes_df.sample(sample - len(forced_recipes_df), random_state=42), forced_recipes_df], axis=0)
    # print(target_df.shape)

    ### Filter method here:
    filtered_df = filters.all_filters(target_df, goal=goal, diet=diet, allergies=allergies, dislikes=dislikes,
                                                 custom_dsl=custom_dsl, time=time, steps=steps)
    # print(filtered_df.shape)

    input_df = pd.concat([user_recipe_df, filtered_df], axis=0)
    # print(input_df.shape)

    merge_df = pd.merge(input_df[['recipe_id', 'metadata']], reviews_df_raw, on="recipe_id", how="left").dropna()
    recipes_df = merge_df[['recipe_id', 'metadata']].groupby(by="recipe_id").first().reset_index()
    reviews_df = merge_df.drop(['metadata'], axis="columns").reset_index()
    # print(recipes_df.shape)

    ######################################################################
    #### Using count vectorizer to create content based latent matrix ####
    #### use dimension reduction with TruncatedSVD                    ####
    ######################################################################

    if vectorizer == 'count':
        vector = CountVectorizer(stop_words='english', ngram_range=ngram, min_df=min_df, max_df=max_df)
        vector_matrix = vector.fit_transform(recipes_df['metadata'])

    elif vectorizer == 'tfidf':
        vector = TfidfVectorizer(stop_words='english', ngram_range=ngram, min_df=min_df, max_df=max_df)
        vector_matrix = vector.fit_transform(recipes_df['metadata'])

    vector_df = pd.DataFrame(vector_matrix.toarray(), index=recipes_df.recipe_id.tolist())

    if dimred == 'svd':
        # TruncatedSVD requires n_components < n_features
        base_case = TruncatedSVD(n_components=min(vector_df.shape[1] - 1, 1000))
        base_case.fit_transform(vector_df)
        cumsum = base_case.explained_variance_ratio_.cumsum()
        content_reduction = max(100, len(cumsum[cumsum <= 0.8]))
        redutor = TruncatedSVD(n_components=content_reduction)

    elif dimred == 'nmf':
        # sklearn's NMF exposes no explained_variance_ratio_, so keep the
        # content_reduction passed in as a parameter
        redutor = NMF(n_components=content_reduction)

    latent_df = redutor.fit_transform(vector_df)
    latent_df = pd.DataFrame(latent_df[:,0:content_reduction], index=recipes_df.recipe_id.tolist())

    ##################################################################
    #### Using user ratings to create content based latent matrix ####
    #### use dimension reduction with TruncatedSVD                ####
    ##################################################################

    ratings_basis = pd.merge(recipes_df[['recipe_id']], reviews_df, on="recipe_id", how="right")
    ratings = ratings_basis.pivot(index = 'recipe_id', columns ='user_id', values = 'rating').fillna(0)

    if dimred == 'svd':
        base_case = TruncatedSVD(n_components=min(ratings.shape[1] - 1, 1000))
        base_case.fit_transform(ratings)
        cumsum = base_case.explained_variance_ratio_.cumsum()
        rating_reduction = max(100, len(cumsum[cumsum <= 0.8]))
        redutor = TruncatedSVD(n_components=rating_reduction)

    elif dimred == 'nmf':
        # sklearn's NMF exposes no explained_variance_ratio_, so keep the
        # rating_reduction passed in as a parameter
        redutor = NMF(n_components=rating_reduction)

    latent_df_2 = redutor.fit_transform(ratings)
    # The pivoted ratings matrix already carries one recipe_id per row
    latent_df_2 = pd.DataFrame(latent_df_2, index=ratings.index.tolist())

    #####################################
    #### Exporting latent DataFrames ####
    #####################################

    return latent_df, latent_df_2
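
The component counts above are chosen by fitting a deliberately large trial decomposition and keeping only the components needed to explain 80% of the variance; a self-contained sketch of that heuristic on synthetic data (shapes and the 0.8 threshold mirror the function above):

import numpy as np
from sklearn.decomposition import TruncatedSVD

X = np.random.rand(500, 300)  # stand-in for the vectorized metadata matrix

# Fit a large trial decomposition, then keep the components that together
# explain at most 80% of the variance (with a floor of 100 components)
trial = TruncatedSVD(n_components=min(X.shape[1] - 1, 1000))
trial.fit(X)
cumsum = trial.explained_variance_ratio_.cumsum()
n_components = max(100, len(cumsum[cumsum <= 0.8]))
latent = TruncatedSVD(n_components=n_components).fit_transform(X)
print(latent.shape)  # (500, n_components)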
Example #5
    def train(self, algo='SVD', like=True, test='cv', local=False):

        if local:
            csv_path = os.path.join(os.path.dirname(__file__),
                                    "data/preprocessed")
            self.recipes = pd.read_csv(f"{csv_path}/recipe_pp.csv")
            self.reviews = pd.read_csv(f"{csv_path}/review_pp.csv")
        else:
            self.recipes = storage.import_file('data/preprocessed',
                                               'recipe_pp.csv')
            self.reviews = storage.import_file('data/preprocessed',
                                               'review_pp.csv')

        if like:
            self.target = 'liked'
            self.s_min = 0
            self.s_max = 1
        else:
            self.target = 'rating'
            self.s_min = 1
            self.s_max = 5

        reader = Reader(rating_scale=(self.s_min, self.s_max))

        self.relevant_data = self.reviews[[
            'user_id', 'recipe_id', self.target
        ]]
        model_data = Dataset.load_from_df(self.relevant_data, reader)

        # Select the requested Surprise algorithm
        # (comparing against algo; a bare string literal is always truthy)

        if algo == 'NormalPredictor':
            self.algorithm = NormalPredictor()

        elif algo == 'BaselineOnly':
            self.algorithm = BaselineOnly()

        elif algo == 'KNNBasic':
            self.algorithm = KNNBasic()

        elif algo == 'KNNWithMeans':
            self.algorithm = KNNWithMeans()

        elif algo == 'KNNWithZScore':
            self.algorithm = KNNWithZScore()

        elif algo == 'KNNBaseline':
            self.algorithm = KNNBaseline()

        elif algo == 'SVD':
            params = {
                'n_epochs': 20,
                'n_factors': 100,
                'lr_all': 0.002,
                'reg_all': 0.02
            }
            self.algorithm = SVD(**params)  # Tuned with svd_grid

        elif algo == 'SVDpp':
            self.algorithm = SVDpp()

        elif algo == 'NMF':
            self.algorithm = NMF()

        elif algo == 'SlopeOne':
            self.algorithm = SlopeOne()

        elif algo == 'CoClustering':
            self.algorithm = CoClustering()

        else:
            raise ValueError(f"Unknown algo: {algo}")

        if test == 'cv':
            cv_results = cross_validate(self.algorithm,
                                        model_data,
                                        measures=['RMSE', 'MAE'],
                                        cv=5,
                                        verbose=True)
            rmse = np.round(cv_results['test_rmse'].mean(), 3)
            mae = np.round(cv_results['test_mae'].mean(), 3)
            train_data = model_data.build_full_trainset()
            self.algorithm.fit(train_data)

        elif test == 'svd_grid':
            param_grid = {
                'n_epochs': [10, 20],
                'n_factors': [100, 200],
                'lr_all': [0.001, 0.002],
                'reg_all': [0.01, 0.02]
            }
            gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
            gs.fit(model_data)
            rmse = gs.best_score['rmse']
            mae = gs.best_score['mae']
            print(gs.best_params['rmse'], gs.best_params['mae'])
            self.algorithm = gs.best_estimator['rmse']
            train_data = model_data.build_full_trainset()
            self.algorithm.fit(train_data)

        else:
            train, test = train_test_split(model_data,
                                           test_size=0.3,
                                           random_state=42)
            self.algorithm.fit(train)
            predictions = self.algorithm.test(test)
            rmse = np.round(accuracy.rmse(predictions), 3)
            mae = np.round(accuracy.mae(predictions), 3)

        return rmse, mae
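
A dict of constructors would make the algorithm selection more compact than the elif chain; a sketch of that alternative (the mapping below covers only a subset of the supported names):

# Hypothetical alternative: dispatch the algo name through a dict
from surprise import NormalPredictor, KNNBasic, SVD, SVDpp, NMF, SlopeOne, CoClustering

ALGORITHMS = {
    'NormalPredictor': NormalPredictor,
    'KNNBasic': KNNBasic,
    'SVD': lambda: SVD(n_epochs=20, n_factors=100, lr_all=0.002, reg_all=0.02),
    'SVDpp': SVDpp,
    'NMF': NMF,
    'SlopeOne': SlopeOne,
    'CoClustering': CoClustering,
}

algorithm = ALGORITHMS['SVD']()  # raises KeyError for unknown names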
Example #6
def create_latent_matrices(vectorizer='tfidf',
                           dimred='svd',
                           ngram=(1, 1),
                           min_df=1,
                           max_df=1.0,
                           local=False):
    ''' Generates the latent dataframes used for the prediction model '''

    print("\n***** Creating Latent Matrices *****")
    print("Loading preprocessed data for recipes and reviews...")

    if local:
        csv_path = os.path.join(os.path.dirname(__file__), "data/preprocessed")
        recipes_df = pd.read_csv(f"{csv_path}/recipe_pp.csv")
        reviews_df = pd.read_csv(f"{csv_path}/review_pp.csv")

    else:
        recipes_df = storage.import_file('data/preprocessed', 'recipe_pp.csv')
        reviews_df = storage.import_file('data/preprocessed', 'review_pp.csv')

    # Test purposes:
    # recipes_df = recipes_df.sample(100)
    # reviews_df = reviews_df[reviews_df['recipe_id'].isin(recipes_df['recipe_id'])]

    print(f"Vectorizing metadata using {vectorizer.upper()} approach...")
    print(f"> Applying ngram {ngram}, min_df {min_df} and max_df {max_df}")

    if vectorizer == 'count':
        vector = CountVectorizer(stop_words='english',
                                 ngram_range=ngram,
                                 min_df=min_df,
                                 max_df=max_df)
        vector_matrix = vector.fit_transform(recipes_df['metadata'])

    elif vectorizer == 'tfidf':
        vector = TfidfVectorizer(stop_words='english',
                                 ngram_range=ngram,
                                 min_df=min_df,
                                 max_df=max_df)
        vector_matrix = vector.fit_transform(recipes_df['metadata'])

    vector_df = pd.DataFrame(vector_matrix.toarray(),
                             index=recipes_df.recipe_id.tolist())

    print(
        f"Reducing metadata vector dimensions using the {dimred.upper()} approach..."
    )

    if dimred == 'svd':
        m_base_case = TruncatedSVD(n_components=min(vector_df.shape[1] -
                                                    1, 1000))
        m_base_case.fit_transform(vector_df)
        m_cumsum = m_base_case.explained_variance_ratio_.cumsum()
        content_reduction = len(m_cumsum[m_cumsum <= 0.8])
        print(f"> {content_reduction} components considered...")
        m_redutor = TruncatedSVD(n_components=content_reduction)

    elif dimred == 'nmf':
        # sklearn's NMF exposes no explained_variance_ratio_, so fall back
        # to a fixed component count (250 here is an illustrative default)
        content_reduction = min(vector_df.shape[1] - 1, 250)
        print(f"> {content_reduction} components considered...")
        m_redutor = NMF(n_components=content_reduction)

    print("Creating metadata's latent dataframe...")

    m_latent_matrix = m_redutor.fit_transform(vector_df)
    content_latent = pd.DataFrame(m_latent_matrix[:, 0:content_reduction],
                                  index=recipes_df.recipe_id.tolist())

    print("Pivoting ratings to user/recipe matrix...")
    ratings_basis = reviews_df.sort_values(by="recipe_id")
    ratings = (ratings_basis.groupby(['recipe_id', 'user_id'
                                      ]).rating.first().unstack()).fillna(0)

    print(
        f"Reducing rating vector dimensions using the {dimred.upper()} approach..."
    )

    if dimred == 'svd':
        r_base_case = TruncatedSVD(n_components=min(ratings.shape[1] -
                                                    1, 1000))
        r_base_case.fit_transform(ratings)
        r_cumsum = r_base_case.explained_variance_ratio_.cumsum()
        rating_reduction = len(r_cumsum[r_cumsum <= 0.8])
        r_redutor = TruncatedSVD(n_components=rating_reduction)

    elif dimred == 'nmf':
        # sklearn's NMF exposes no explained_variance_ratio_, so fall back
        # to a fixed component count (800 here is an illustrative default)
        rating_reduction = min(ratings.shape[1] - 1, 800)
        r_redutor = NMF(n_components=rating_reduction)

    print("Creating rating's latent dataframe...")

    r_latent_matrix = r_redutor.fit_transform(ratings)
    # The pivoted ratings matrix already carries one recipe_id per row
    rating_latent = pd.DataFrame(r_latent_matrix, index=ratings.index.tolist())

    print("Exporting latent matrixes as CSV...")

    storage.upload_file(content_latent, 'data/models', 'content_latent.csv')
    storage.upload_file(rating_latent, 'data/models', 'rating_latent.csv')

    print("Latent matrix preparation and exporting done!")

    return content_latent, rating_latent
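
A minimal usage sketch (the vectorizer settings are illustrative):

# Hypothetical usage: build and upload both latent matrices
content_latent, rating_latent = create_latent_matrices(vectorizer='tfidf', dimred='svd', ngram=(1, 2))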
Example #7
def get_user_recommendations(user_inputs=None,
                             n_recommendations=None,
                             collaborative=0.5,
                             clear_neg=False,
                             content_latent=None,
                             rating_latent=None):
    ''' Gets the recommendations for one user by taking all of its liked and disliked dishes,
        getting the recommendation based on each recipe and then summing the scores '''

    print("\n***** Calculating Recommendations *****")

    if user_inputs is None:
        user_inputs = TEST_USER

    print("Loading latent matrixes from CSV...")

    if content_latent is None:
        content_latent = storage.import_file(
            'data/models',
            'content_latent.csv').rename(columns={
                'Unnamed: 0': 'recipe_id'
            }).set_index("recipe_id")

    if rating_latent is None:
        rating_latent = storage.import_file(
            'data/models',
            'rating_latent.csv').rename(columns={
                'Unnamed: 0': 'recipe_id'
            }).set_index("recipe_id")

    print("Listing likes/dislikes and running individual recommendations...")

    user_likes = [
        recipe for recipe, liked in user_inputs.items() if liked == 1
    ]
    user_dislikes = [
        recipe for recipe, liked in user_inputs.items() if liked == 0
    ]

    if user_likes:
        recommendations = [
            get_recommendation(recipe, content_latent, rating_latent,
                               collaborative) for recipe in user_likes
        ]
        recommendations_df = pd.concat(recommendations)

    if user_dislikes:
        dislikes = [
            get_recommendation(recipe, content_latent, rating_latent,
                               collaborative) for recipe in user_dislikes
        ]
        dislike_df = pd.concat(dislikes)
        dislike_df[[
            'content', 'collaborative', 'hybrid'
        ]] = dislike_df[['content', 'collaborative', 'hybrid']] * (-1)

    print("Grouping and summing recommendation matrixes...")

    if user_likes and user_dislikes:
        complete_recs = pd.concat([recommendations_df, dislike_df], axis=0)
    elif user_likes:
        complete_recs = recommendations_df
    elif user_dislikes:
        complete_recs = dislike_df
    else:
        raise ValueError("user_inputs must contain at least one like or dislike")

    grouped_recommendations = complete_recs.groupby(
        by="recipe_id").sum().sort_values(by="hybrid", ascending=False)
    grouped_recommendations = grouped_recommendations[
        ~grouped_recommendations.index.isin(user_likes + user_dislikes)]

    if clear_neg:
        grouped_recommendations = grouped_recommendations[
            grouped_recommendations['hybrid'] > 0]

    print("Generating recommendation scores...")

    score_min = grouped_recommendations['hybrid'].min()
    score_max = grouped_recommendations['hybrid'].max()
    score_dif = score_max - score_min

    grouped_recommendations['rec_score'] = np.round(
        (grouped_recommendations['hybrid'] - score_min) / score_dif, 3)
    grouped_recommendations.sort_values(by='rec_score',
                                        ascending=False,
                                        inplace=True)

    print("Returning final recommendation matrix!")

    if n_recommendations:
        grouped_recommendations = grouped_recommendations.head(
            n_recommendations)

    return grouped_recommendations
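
A minimal usage sketch (the recipe ids in the input dict are illustrative):

# Hypothetical usage: two likes and one dislike, leaning on collaborative scores
user_inputs = {101: 1, 202: 1, 303: 0}
recs = get_user_recommendations(user_inputs=user_inputs, n_recommendations=10, collaborative=0.7)
print(recs[['hybrid', 'rec_score']].head())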