Example #1
def peuimportelenom():
    # `ap`, `ratings`, `vecteur`, and `artist_names` are globals defined elsewhere
    noms = request.form.getlist("dblst_artists")
    sugg = []

    for el in noms:
        artiste = ap[ap.name == el]
        lind = list(artiste.artistID)[0] - 1
        vecteur[lind] = artiste.playCountScaled.median()

    # Build the matrix: the new user's vector becomes the last row
    X = np.vstack((ratings, vecteur))

    # Code carried over from the Jupyter notebook
    n_users, n_items = X.shape

    Xcsr = csr_matrix(X)
    Xcoo = Xcsr.tocoo()
    data = Dataset()
    data.fit(np.arange(n_users), np.arange(n_items))
    interactions, weights = data.build_interactions(zip(Xcoo.row, Xcoo.col, Xcoo.data))
    train, test = random_train_test_split(interactions)

    model = LightFM(learning_rate=0.05, loss='warp')
    model.fit(train, epochs=10, num_threads=2)

    # predict() takes a user index and an array of item indices;
    # the new user is the last row of X
    scores = model.predict(n_users - 1, np.arange(n_items))
    top_items = ap["name"].unique()[np.argsort(-scores)]

    sugg = top_items[:10]

    return render_template("page.html", artist_names=artist_names, noms=noms, sugg=sugg)
Example #2
    def build_id_mappings(self, hybrid=False) -> Dataset:
        """Builds internal indice mapping for user-item interactions and encodes item features.

        Reads in user-item interactions and the features associated with each item and builds a mapping
        between the user and item ids from our input data to indices that will be used internally by our model.

        Item features are further encoded as an argument passed to Dataset.fit. These are supplied as a flat
        list of unique item features for the entire dataset.

        Args:
            df_interactions (pandas.DataFrame): User-Item interactions DataFrame consisting of user and item IDs.
            df_item_features (pandas.DataFrame): Item IDs and their corresponding features as column separated values.

        Returns:
            lightfm.data.Dataset: Tool for building interaction and feature matrices,
                taking care of the mapping between user/item ids and feature names and internal feature indices.
            tag_sector (list): list of all the unique cashtag sector information in the dataset.
            tag_industry (list): list of all the unique cashtag industries information in the dataset.
            :param hybrid:

        """

        dataset = Dataset()
        dataset.fit(
            (x for x in self.df['user_id']), (x for x in self.df['tag_id']),
            item_features=(x
                           for x in self.df['tag_sector']) if hybrid else None)
        return dataset
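For reference, a minimal sketch (with made-up ids standing in for self.df['user_id'] and self.df['tag_id']) of what the fitted Dataset exposes: mapping() returns the raw-id-to-internal-index dictionaries that build_id_mappings creates.

from lightfm.data import Dataset

# Made-up ids; the real code feeds dataframe columns into fit()
dataset = Dataset()
dataset.fit(users=["u1", "u2"], items=["$AAPL", "$TSLA"])

# mapping() returns four dicts: user ids, user features, item ids, item features
user_id_map, _, item_id_map, _ = dataset.mapping()
assert user_id_map["u1"] == 0 and item_id_map["$TSLA"] == 1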
Example #3
def create_dataset(users, movies):
    dataset = Dataset()

    dataset.fit(
        users=[x["id"] for x in users],
        items=[x["id"] for x in movies],
        item_features=create_movie_features_set(movies),
    )

    return dataset
Example #4
def prepareData(df, tags):
    df = df[df.actionCategory == "WebNei clicked"]
    actionByUsers = df.groupby(["userName", "actionName"]).size()
    uniqueUsers = df[df.userName.isin(
        actionByUsers.index.get_level_values(
            0).unique().values)].drop_duplicates('userName')
    uniqueUsers['user_features'] = uniqueUsers[[
        'title', 'team', 'organization', 'department'
    ]].values.tolist()
    dataset = Dataset()
    dataset.fit((list(actionByUsers.index.get_level_values(0))),
                (list(actionByUsers.index.get_level_values(1))))

    rowM, colM = prepareJson(tags)
    rowU, colU = prepareUserFeatures(uniqueUsers)

    dataset.fit_partial(items=rowM,
                        item_features=colM,
                        users=rowU,
                        user_features=colU)

    (interactions, weights) = dataset.build_interactions(
        zip(list(actionByUsers.index.get_level_values(0)),
            list(actionByUsers.index.get_level_values(1))))
    # Note: [colM] is a one-element list, so zip() pairs only the first id
    # with the feature list; the remaining ids get no explicit features here
    item_features = dataset.build_item_features(zip(rowM, [colM]))
    user_features = dataset.build_user_features(zip(rowU, [colU]))
    return interactions, item_features, user_features
Example #5
def test_fitting_no_identity():

    users, items = 10, 100

    dataset = Dataset(user_identity_features=False, item_identity_features=False)
    dataset.fit(range(users), range(items))

    assert dataset.interactions_shape() == (users, items)
    assert dataset.user_features_shape() == (users, 0)
    assert dataset.item_features_shape() == (items, 0)

    assert dataset.build_interactions([])[0].shape == (users, items)
    assert dataset.build_user_features([], normalize=False).getnnz() == 0
    assert dataset.build_item_features([], normalize=False).getnnz() == 0
Example #6
def create_recommender():
    # Obtain the interaction table (JSON data) from DynamoDB
    dynamodb = boto3.resource('dynamodb')
    table = dynamodb.Table('eye_video_vote')
    response = table.scan()
    raw_data = response['Items']

    # Transform the JSON records into user-item-rating interaction format
    final_df = pd.DataFrame(columns=['userId', 'videoId', 'rating'])

    for i in raw_data:
        if any('upVote' in s for s in list(i.keys())):
            df1 = {k: i[k] for k in ('upVote', 'videoId')}
            df1['videoId'] = {df1['videoId']}
            df1 = pd.DataFrame.from_dict(df1, orient='index').T
            df1['rating'] = randint(4, 5)
            df1 = df1.fillna(np.nan).ffill()
            df1.rename(columns={'upVote': 'userId'}, inplace=True)
            final_df = pd.concat([final_df, df1])
        if any('downVote' in s for s in list(i.keys())):
            df2 = {k: i[k] for k in ('downVote', 'videoId')}
            df2['videoId'] = {df2['videoId']}
            df2 = pd.DataFrame.from_dict(df2, orient='index').T
            df2['rating'] = randint(1, 2)
            df2 = df2.fillna(np.nan).ffill()
            df2.rename(columns={'downVote': 'userId'}, inplace=True)
            final_df = pd.concat([final_df, df2])

    # Rename the columns
    final_df.rename(columns={'userId': 'UserID', 'videoId': 'MovieID'}, inplace=True)

    # Generate the appropriate LightFM dataset
    dataset = Dataset()
    dataset.fit(users=(row['UserID'] for index, row in final_df.iterrows()),
                items=(row['MovieID'] for index, row in final_df.iterrows()))

    (interactions, weights) = dataset.build_interactions(
        (row['UserID'], row['MovieID'], row['rating'])
        for index, row in final_df.iterrows())

    # Collaborative-filtering model
    model_cf = LightFM(no_components=20, loss='warp')
    model_cf.fit(interactions, user_features=None, item_features=None,
                 sample_weight=None, epochs=20, num_threads=4)
    with open('model_cf.pickle', 'wb') as fle:
        pickle.dump(model_cf, fle, protocol=pickle.HIGHEST_PROTOCOL)

    return
Example #7
def test_fitting():

    users, items = 10, 100

    dataset = Dataset()
    dataset.fit(range(users), range(items))

    assert dataset.interactions_shape() == (users, items)
    assert dataset.user_features_shape() == (users, users)
    assert dataset.item_features_shape() == (items, items)

    assert dataset.build_interactions([])[0].shape == (users, items)
    assert dataset.build_user_features([]).getnnz() == users
    assert dataset.build_item_features([]).getnnz() == items
Example #8
def test_build_features():

    users, items = 10, 100

    dataset = Dataset(user_identity_features=False, item_identity_features=False)
    dataset.fit(
        range(users),
        range(items),
        ["user:{}".format(x) for x in range(users)],
        ["item:{}".format(x) for x in range(items)],
    )

    # Build from lists
    user_features = dataset.build_user_features(
        [
            (user_id, ["user:{}".format(x) for x in range(users)])
            for user_id in range(users)
        ]
    )
    assert user_features.getnnz() == users ** 2

    item_features = dataset.build_item_features(
        [
            (item_id, ["item:{}".format(x) for x in range(items)])
            for item_id in range(items)
        ]
    )
    assert item_features.getnnz() == items ** 2

    # Build from dicts
    user_features = dataset.build_user_features(
        [
            (user_id, {"user:{}".format(x): float(x) for x in range(users)})
            for user_id in range(users)
        ],
        normalize=False,
    )

    assert np.all(user_features.todense() == np.array([list(range(users))] * users))

    item_features = dataset.build_item_features(
        [
            (item_id, {"item:{}".format(x): float(x) for x in range(items)})
            for item_id in range(items)
        ],
        normalize=False,
    )

    assert np.all(item_features.todense() == np.array([list(range(items))] * items))

    # Test normalization
    item_features = dataset.build_item_features(
        [
            (item_id, {"item:{}".format(x): float(x) for x in range(items)})
            for item_id in range(items)
        ]
    )

    assert np.all(item_features.sum(1) == 1.0)
Example #9
def predict_artist_list(artist_select):
    # `ap` and `artists` are globals defined elsewhere
    # Build a user-artist rating matrix
    ratings_df = ap.pivot(index='userID', columns='artistID', values='playCountScaled')

    artist_names = ap.sort_values("artistID")["name"].unique()

    # One slot per artist; mark the artists the new user selected
    add_user = [0] * 17632

    for item in artist_select:
        artists_idx = artists.index[artists["name"] == item]
        for j in artists_idx:
            add_user[j] = 1

    # Append the new user's row once, after all selections are marked
    ratings_df = pd.DataFrame(np.vstack((ratings_df, add_user)))
    new_userID = ratings_df.shape[0] - 1
    ratings = ratings_df.fillna(0).values

    # Build a sparse matrix
    X = csr_matrix(ratings)

    n_users, n_items = ratings_df.shape

    # Build data references + train/test split
    Xcoo = X.tocoo()
    data = Dataset()
    data.fit(np.arange(n_users), np.arange(n_items))
    interactions, weights = data.build_interactions(zip(Xcoo.row, Xcoo.col, Xcoo.data))
    train, test = random_train_test_split(interactions)

    model = LightFM(learning_rate=0.05, loss='warp')
    model.fit(train, epochs=10, num_threads=2)

    # Predict for the newly appended user
    scores = model.predict(new_userID, np.arange(n_items))
    top_items = artist_names[np.argsort(-scores)]
    return top_items[0:10]
Example #10
def init_lightfm_dataset(unique_elements,
                         user_features=None,
                         movie_features=None):
    unique_users, unique_movies = unique_elements
    if user_features is not None:
        # The first (id, features) tuple supplies the feature vocabulary
        user_features = [*user_features[0][1]]
    if movie_features is not None:
        movie_features = [*movie_features[0][1]]
    dataset = Dataset()
    dataset.fit(users=unique_users,
                items=unique_movies,
                user_features=user_features,
                item_features=movie_features)
    return dataset
Example #13
def create_dataset(df):
    ## Create a mapping between the user and item ids from our input data
    # and the indices that will be used internally by the model
    dataset = Dataset()
    list_user_names = list(df.index)
    list_items = df.columns.values
    dataset.fit((user_name for user_name in list_user_names),
                (item for item in list_items))

    ## Build the interaction matrix
    # It encodes the interactions between users and items; we need every
    # (user, item) pair that has a value in df (see the toy illustration
    # after this function)
    list_pairs = list(df.stack().index)
    (interactions, weights) = dataset.build_interactions(
        (pair for pair in list_pairs))

    return dataset, interactions, weights
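A toy illustration (made-up frame) of the df.stack() trick used above: assuming df holds NaN where a user has no interaction, stacking drops those cells and leaves exactly the (user, item) pairs with values.

import numpy as np
import pandas as pd

df = pd.DataFrame([[1, np.nan], [np.nan, 1]],
                  index=["u1", "u2"], columns=["i1", "i2"])
# stack() drops NaN cells, leaving a MultiIndex of (user, item) pairs
print(list(df.stack().index))  # [('u1', 'i1'), ('u2', 'i2')]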
Example #14
    def obtener_matrices(self):
        """
        obtener_matrices method. Obtains the matrices needed to build the LightFM models.

        This method is only used by the text interface.
        """

        global train, test, modelo, item_features, user_features

        # Fetch the dataframes
        Entrada.obtener_datos()
        ratings_df = Entrada.ratings_df
        users_df = Entrada.users_df
        items_df = Entrada.items_df

        # Turn the dataframes into matrices the models can consume
        dataset = Dataset()
        dataset.fit(users_df[users_df.columns.values[0]],
                    items_df[items_df.columns.values[0]],
                    user_features=users_df[users_df.columns.values[1]],
                    item_features=items_df[items_df.columns.values[1]])

        # If the model is collaborative or hybrid, take the users' ratings into account
        if self.opcion_modelo == 1 or self.opcion_modelo == 2:
            (interacciones, pesos) = dataset.build_interactions(
                (row[ratings_df.columns.values[0]],
                 row[ratings_df.columns.values[1]],
                 row[ratings_df.columns.values[2]])
                for index, row in ratings_df.iterrows())
        else:
            (interacciones, pesos) = dataset.build_interactions(
                (row[ratings_df.columns.values[0]],
                 row[ratings_df.columns.values[1]])
                for index, row in ratings_df.iterrows())

        # Build and save the feature matrices
        item_features = dataset.build_item_features(
            (row[items_df.columns.values[0]],
             [row[items_df.columns.values[1]]])
            for index, row in items_df.iterrows())
        user_features = dataset.build_user_features(
            (row[users_df.columns.values[0]],
             [row[users_df.columns.values[1]]])
            for index, row in users_df.iterrows())
        print("Saving the item features matrix")
        guardar_datos_pickle(item_features, 'the item features matrix')
        print("Saving the user features matrix")
        guardar_datos_pickle(user_features, 'the user features matrix')

        # Split the interactions into train and test sets and save them
        train, test = random_train_test_split(interacciones,
                                              test_percentage=0.2)
        print("Saving the train matrix")
        guardar_datos_pickle(train, 'the train matrix')
        print("Saving the test matrix")
        guardar_datos_pickle(test, 'the test matrix')
Example #17
def interactions(df):
    movie_genre = [x.split("|") for x in df["genre"]]
    all_movie_genre = sorted(
        list(set(itertools.chain.from_iterable(movie_genre))))

    all_occupations = sorted(list(set(df["occupation"])))

    dataset = Dataset()
    dataset.fit(
        df["userID"],
        df["itemID"],
        item_features=all_movie_genre,
        user_features=all_occupations,
    )

    item_features = dataset.build_item_features(
        (x, y) for x, y in zip(df.itemID, movie_genre))

    user_features = dataset.build_user_features(
        (x, [y]) for x, y in zip(df.userID, df["occupation"]))

    (interactions, _) = dataset.build_interactions(df.iloc[:, 0:3].values)

    train_interactions, test_interactions = cross_validation.random_train_test_split(
        interactions,
        test_percentage=TEST_PERCENTAGE,
        random_state=np.random.RandomState(SEEDNO),
    )
    return train_interactions, test_interactions, item_features, user_features
Example #18
def lightfm_trainer(train: np.ndarray, loss: str, n_components: int,
                    lam: float) -> LightFM:
    """Train a LightFM model on the positive interactions in `train`."""
    model = LightFM(
        loss=loss,
        user_alpha=lam,
        item_alpha=lam,
        no_components=n_components,
        learning_rate=0.001,
        random_state=12345,
    )
    dataset = Dataset()
    dataset.fit(train[:, 0], train[:, 1])
    (interactions, weights) = dataset.build_interactions(
        ((x[0], x[1], 1) for x in train[train[:, 2] == 1]))
    model.fit(interactions, epochs=100)

    return model
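A usage sketch with toy data (all values made up): the trainer expects an integer-coded array whose third column flags positive interactions.

import numpy as np

# Toy interaction log: columns are (user_id, item_id, clicked), clicked in {0, 1}
train = np.array([[0, 0, 1],
                  [0, 1, 0],
                  [1, 1, 1],
                  [1, 2, 1]])

model = lightfm_trainer(train, loss="warp", n_components=16, lam=1e-5)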
Example #19
def main():
    current_stage = 6
    model = LightFM(no_components=30)
    dataset = Dataset()

    for c in range(0, current_stage + 1):
        click_train = pd.read_csv(
            train_path + "/underexpose_train_click-{}.csv".format(c),
            header=None,
            names=["user_id", "item_id", "time"],
        )
        click_test = pd.read_csv(
            test_path + "/underexpose_test_click-{}.csv".format(c),
            header=None,
            names=["user_id", "item_id", "time"],
        )
        dataset.fit_partial(click_train["user_id"], click_train["item_id"])
        num_users, num_items = dataset.interactions_shape()
        log('Num users: {}, num_items {}.'.format(num_users, num_items))
Example #20
class DataFit:
    def __init__(self):
        self.dataset = None

    def fit(self):
        book_list = DataPrep.get_book_list()
        book_feature_list = DataPrep.get_feature_list()
        user_list = DataPrep.get_user_list()
        self.dataset = Dataset()
        self.dataset.fit(users=user_list,
                         items=book_list,
                         item_features=book_feature_list)

        rating_list = DataPrep.get_rating_list()
        interactions, weights = self.dataset.build_interactions(rating_list)

        book_features = DataPrep.create_features()
        books_features = self.dataset.build_item_features(book_features)
        return interactions, weights, books_features

    def create_new_interactions(self, checkpoint):
        rating_list = DataPrep.get_rating_list_from_checkpoint(checkpoint)
        interactions, weights = self.dataset.build_interactions(rating_list)
        return interactions, weights

    def get_user_mapping(self):
        user_id_map, _, _, _ = self.dataset.mapping()
        return user_id_map

    def get_book_mapping(self):
        _, _, item_id_map, _ = self.dataset.mapping()
        return item_id_map

    @staticmethod
    def fit_evaluate(test_percentage=0.1):
        book_list = DataPrep.get_book_list()
        book_feature_list = DataPrep.get_feature_list()
        user_list = DataPrep.get_user_list()
        dataset = Dataset()
        dataset.fit(users=user_list,
                    items=book_list,
                    item_features=book_feature_list)

        rating_list = DataPrep.get_rating_list()
        random.shuffle(rating_list)
        rating_list_test = rating_list[:int(test_percentage *
                                            len(rating_list))]
        rating_list_train = rating_list[int(test_percentage *
                                            len(rating_list)):]
        interactions_train, weights_train = dataset.build_interactions(
            rating_list_train)
        interactions_test, weights_test = dataset.build_interactions(
            rating_list_test)

        return interactions_train, weights_train, interactions_test, weights_test
Example #21
def create_dataset(df, item_features, list_item_features):
    """
    Create the LightFM dataset from df, which stores all the data, including
    the features (tags) of each product.

    Args:
        df (pandas.DataFrame): user-by-item interaction table.
        item_features: iterable of (item id, [feature names]) pairs.
        list_item_features: flat list of all unique item feature names.
    """
    ## Create a mapping between the user and item ids from our input data
    # and the indices that will be used internally by the model
    dataset = Dataset(item_identity_features=True)
    list_user_names = list(df.index)
    list_items = df.columns.values

    dataset.fit(
        (user_name for user_name in list_user_names),
        (item for item in list_items),
        item_features=(item_feature for item_feature in list_item_features))

    ## Build the interaction matrix
    # It encodes the interactions between users and items;
    # we need every (user, item) pair that has a value in df
    list_pairs = list(df.stack().index)
    (interactions, weights) = dataset.build_interactions(
        (pair for pair in list_pairs))

    item_feature_matrix = dataset.build_item_features(item_features)

    return dataset, interactions, weights, item_feature_matrix
Example #22
def fetch_data():
    # Create a SQL connection to our SQLite database
    con = sqlite3.connect("db.sqlite3")
    cur = con.cursor()

    # The result of a "cursor.execute" can be iterated over by row
    data = []
    users = []
    movies = []
    for row in cur.execute('SELECT id FROM RecoFramework_userinfo;'):
        users.append(row[0])

    for row in cur.execute('SELECT movieId FROM RecoFramework_movies;'):
        movies.append(row[0])

    for row in cur.execute(
            'SELECT userId, movieId, rating FROM RecoFramework_ratings WHERE rating = 5;'
    ):
        data.append(row)

    dataset = Dataset()
    dataset.fit(users, movies)
    # build_interactions returns (interactions, weights); here the weights
    # play the role of ratings
    interactions, ratings = dataset.build_interactions(data)

    # Be sure to close the connection
    con.close()

    train, test = random_train_test_split(interactions)

    model = LightFM(loss='warp')

    # Train the LightFM model using the fit method
    model.fit(train, epochs=30, num_threads=2)

    # Use the public mapping() API instead of the private attributes
    user_dict, _, movie_dict, _ = dataset.mapping()

    return model, ratings, user_dict, movie_dict, train, test
Example #23
def train_model(df,
                user_id_col='user_id',
                item_id_col='business_id',
                item_name_col='name_business',
                evaluate=True):
    """Train the model using collaborative filtering.

    Args:
        df: the input dataframe.
        user_id_col: user id column.
        item_id_col: item id column.
        item_name_col: item name column.
        evaluate: if evaluate the model performance.

    Returns:
        model_full: the trained model.
        df_interactions: dataframe with user-item interactions.
        user_dict: user dictionary containing user_id as
            key and interaction_index as value.
        item_dict: item dictionary containing item_id
            as key and item_name as value.

    """
    if evaluate:
        print('Evaluating model...')
        evaluate_model(df, user_id_col='user_id', item_id_col='business_id')

    print('Training model...')
    # build recommendations for known users and known businesses
    # with collaborative filtering method
    ds_full = Dataset()
    # we call fit to supply userid, item id and user/item features
    ds_full.fit(
        df[user_id_col].unique(),  # all the users
        df[item_id_col].unique(),  # all the items
    )
    (interactions, weights) = ds_full.build_interactions([(x[0], x[1], x[2])
                                                          for x in df.values])
    # model
    model_full = LightFM(no_components=100,
                         learning_rate=0.05,
                         loss='warp',
                         max_sampled=50)
    model_full.fit(interactions,
                   sample_weight=weights,
                   epochs=10,
                   num_threads=10)
    # mapping
    user_id_map, _, business_id_map, _ = ds_full.mapping()

    # data preparation
    df_interactions = pd.DataFrame(weights.todense())
    df_interactions.index = list(user_id_map.keys())
    df_interactions.columns = list(business_id_map.keys())
    user_dict = user_id_map
    item_dict = df.set_index(item_id_col)[item_name_col].to_dict()
    return model_full, df_interactions, user_dict, item_dict
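A follow-up sketch (hypothetical df and user id) of how the returned artifacts are typically consumed: translate the raw user id through user_dict, score every item column, and rank.

import numpy as np

# Hypothetical follow-up: rank all items for one known user
model, df_interactions, user_dict, item_dict = train_model(df, evaluate=False)

uid = user_dict["some_user_id"]                      # raw id -> internal index
scores = model.predict(uid, np.arange(df_interactions.shape[1]))
top_items = np.array(df_interactions.columns)[np.argsort(-scores)][:10]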
Example #24
    def __init__(self, dataset: Dataset) -> None:
        """
        userid: user_id
        row: internal user id
        itemid: recipe_id
        column: internal recipe id
        """
        userid2row, _, itemid2col, _ = dataset.mapping()
        self.userid2row = userid2row
        self.itemid2col = itemid2col
        # Invert the dictionaries to get the mapping in the other direction
        self.row2userid = {
            value: key
            for key, value in self.userid2row.items()
        }
        self.col2itemid = {v: k for k, v in self.itemid2col.items()}
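A hypothetical usage sketch, assuming the __init__ above belongs to a small mapper class (called IdMapper here):

from lightfm.data import Dataset

dataset = Dataset()
dataset.fit(users=["alice", "bob"], items=["pancakes", "soup"])

mapper = IdMapper(dataset)                 # IdMapper is a hypothetical class name
row = mapper.userid2row["bob"]             # raw user_id -> internal row
assert mapper.row2userid[row] == "bob"     # and back again
assert mapper.col2itemid[0] == "pancakes"  # internal column -> raw recipe_id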
Example #25
    def build_lightfm_dataset(self) -> None:
        """
        Builds final datasets for user-variant and variant-variant recommendations.
        """
        logging.info("Creating LightFM matrices...")
        lightfm_dataset = LFMDataset()
        ratings_list = self.interaction_list
        logging.info('#'*60)
        lightfm_dataset.fit_partial(
            (rating['user_id'] for rating in ratings_list),
            (rating['product_id'] for rating in ratings_list)
        )

        item_feature_names = self.item_df.columns
        logging.info(f'Logging item_feature_names - with product_id: \n{item_feature_names}')
        item_feature_names = item_feature_names[~item_feature_names.isin(['product_id'])]
        logging.info(f'Logging item_feature_names - without product_id: \n{item_feature_names}')

        for item_feature_name in item_feature_names:
            lightfm_dataset.fit_partial(
                items=(item['product_id'] for item in self.item_list),
                item_features=((item[item_feature_name] for item in self.item_list)),
            )

        item_features_data = []
        for item in self.item_list:
            item_features_data.append(
                (
                    item['product_id'],
                    [
                        item['product_name'],
                        item['aisle'],
                        item['department']
                    ],
                )
            )
        logging.info(f'Logging item_features_data @build_lightfm_dataset: \n{item_features_data}')
        self.item_features = lightfm_dataset.build_item_features(item_features_data)
        self.interactions, self.weights = lightfm_dataset.build_interactions(
            ((rating['user_id'], rating['product_id']) for rating in ratings_list)
        )

        self.n_users, self.n_items = self.interactions.shape

        logging.info(f'Logging self.interactions @build_lightfm_dataset: \n{self.interactions}')
        logging.info(f'Logging self.weights @build_lightfm_dataset: \n{self.weights}')
        logging.info(
            f'The shape of self.interactions {self.interactions.shape} '
            f'and self.weights {self.weights.shape} represent the user-item matrix.')
Example #26
def evaluate_model(df,
                   user_id_col='user_id',
                   item_id_col='business_id',
                   stratify=None):
    """ Model evaluation.

    Args:
        df: the input dataframe.
        user_id_col: user id column.
        item_id_col: item id column.
        stratify: if use stratification.

    Returns:
        train_auc: training set auc score.
        test_auc: testing set auc score.

    """
    # model evaluation
    # create test and train datasets
    print('model evaluation')
    train, test = train_test_split(df, test_size=0.2, stratify=stratify)
    ds = Dataset()

    # we call fit to supply userid, item id and user/item features
    ds.fit(
        df[user_id_col].unique(),  # all the users
        df[item_id_col].unique(),  # all the items
    )

    # plugging in the interactions
    (train_interactions, train_weights) = ds.build_interactions([
        (x[0], x[1], x[2]) for x in train.values
    ])
    (test_interactions, _) = ds.build_interactions([(x[0], x[1], x[2])
                                                    for x in test.values])
    # model
    model = LightFM(no_components=100,
                    learning_rate=0.05,
                    loss='warp',
                    max_sampled=50)
    model.fit(train_interactions,
              sample_weight=train_weights,
              epochs=10,
              num_threads=10)

    # AUC-ROC
    train_auc = auc_score(model, train_interactions, num_threads=20).mean()
    print('Training set AUC: %s' % train_auc)
    test_auc = auc_score(model, test_interactions, num_threads=20).mean()
    print('Testing set AUC: %s' % test_auc)
    # Return the scores promised in the docstring
    return train_auc, test_auc
Example #27
def create_datasets(cluster_id):

    events_list = get_events_from_es(cluster_id)

    dataframe_interactions, dataframe_users_features, dataframe_item_features, user_tuple, item_tuple = create_interactions_and_features(events_list, cluster_id)

    print(dataframe_interactions, cluster_id, file=sys.stderr)
    print(dataframe_users_features, cluster_id, file=sys.stderr)
    print(dataframe_item_features, cluster_id, file=sys.stderr)

    user_features = format_users_features(dataframe_users_features)
    item_features = format_items_features(dataframe_item_features)

    dataset = Dataset()

    dataset.fit(
            dataframe_interactions['user'].unique(),  # all the users
            dataframe_interactions['item'].unique(),  # all the items
            user_features=user_features,
            item_features=item_features
    )

    (interactions, weights) = dataset.build_interactions(
        [(x[0], x[1], x[2]) for x in dataframe_interactions.values])

    final_user_features = dataset.build_user_features(user_tuple, normalize=False)
    final_item_features = dataset.build_item_features(item_tuple, normalize=False)

    return dataset, interactions, weights, final_item_features, final_user_features
Example #28
def predict(user_id: int) -> Optional[List[str]]:  # Optional: None is returned when artifacts are missing
    model_file = Path(BASE_DIR).joinpath(MODEL_FILE_NAME)
    data_file = Path(BASE_DIR).joinpath(DATA_FILE_NAME)

    if not model_file.exists():
        return None

    if not data_file.exists():
        return None

    model: LightFM = pickle.load(open(model_file, "rb"))
    data: pd.DataFrame = pd.read_csv(data_file)

    dataset = Dataset()

    dataset.fit((cac for cac in data.cac.unique()),
                (product for product in data.product_code.unique()))

    features = ['product_code', 'country_code', 'cost_bin']

    for product_feature in features:
        dataset.fit_partial(
            users=(cac for cac in data.cac.unique()),
            items=(product for product in data.product_code.unique()),
            item_features=(feature
                           for feature in data[product_feature].unique()))

    item_features = dataset.build_item_features(
        (getattr(row, 'product_code'),
         [getattr(row, product_feature) for product_feature in features
          if product_feature != 'product_code'])
        for row in data[features].itertuples())

    predicted_products: List[str] = sample_recommendation(
        model=model,
        dataset=dataset,
        raw_data=data,
        item_features=item_features,
        user_ids=user_id)

    return predicted_products
Example #30
def train_model():
    dataset = Dataset()
    dataset.fit((x['User_ID'] for x in get_ratings()),
                (x['Item_ID'] for x in get_ratings()))
    for i in range(25):
        add_item_features(dataset, paan_features[i])
    (interactions, weights) = dataset.build_interactions(
        ((x['User_ID'], x['Item_ID']) for x in get_ratings()))

    item_features = dataset.build_item_features(((x['Item_ID'], [
        x['Banaras'], x['Calcutta'], x['Maghai'], x['Sada'], x['Meetha'],
        x['Chocolate'], x['Dry Fruit'], x['Mango'], x['Strawberry'],
        x['Pineapple'], x['Kaju'], x['Jelly'], x['Rose'], x['Shahi'],
        x['Kesar'], x['Vanilla'], x['Masala'], x['Khatta'], x['Orange'],
        x['White'], x['Silver'], x['RaatRani'], x['Nutella'], x['Special'],
        x['Gold']
    ]) for x in get_item_features()))

    model = LightFM(loss='bpr')
    model.fit(interactions, item_features=item_features)

    labels = np.array([x['Item_ID'] for x in get_item_features()])
    print("Model Trained Successfully.....")
    return model, interactions, labels, item_features
Example #31
    user_stats_file = sys.argv[3]
    business_stats_file = sys.argv[4]

    print('[ %04ds ] Program started' % (time.time() - start_time))

    training_set: List[Review] = Review.load_from_file(training_set_file)
    user_stats: Dict[str, User] = User.load_from_file(user_stats_file)
    business_stats: Dict[str, Business] = Business.load_from_file(
        business_stats_file)

    print('[ %04ds ] Files loaded' % (time.time() - start_time))

    all_user_features = ['NO_FEAT']
    all_business_features = Business.collect_business_features(business_stats)

    dataset = Dataset()
    dataset.fit(User.extract_user_ids(user_stats),
                Business.extract_business_ids(business_stats),
                user_features=all_user_features,
                item_features=all_business_features)

    user_features = dataset.build_user_features(
        User.build_user_features(user_stats,
                                 User.extract_user_ids(user_stats)), True)

    business_features = dataset.build_item_features(
        Business.build_business_features(
            business_stats, Business.extract_business_ids(business_stats)),
        True)

    print('[ %04ds ] Dataset initialized' % (time.time() - start_time))
Example #32
    def run(self,
            epochs: int = 1,
            no_components: int = 50,
            learning_rate: float = 0.05) -> Dict[str, float]:
        """
         build interaction matrix -> build movie features -> build model

        Example (5000 samples, 50 components, 5 epochs, learning_rate=0.05)
        =================================
        {'auc_train': 0.66268414, 'auc_test': 0.67257625,
         'precision_train@10': 0.035984848, 'precision_test@10': 0.014193548,
         'recall_train@10': 0.06827082513973247, 'recall_test@10': 0.0646373101211811}

        ###########################
        #### Random Stratified ####
        ###########################
        Example (2 million samples, 50 components, 1 epochs, learning_rate=0.05)
        =================================
        {'auc_train': 0.5171841, 'auc_test': 0.51610065,
         'precision_train@10': 0.018248174, 'precision_test@10': 0.0040145987,
         'recall_train@10': 0.0008001067196610589, 'recall_test@10': 0.0007001527280332769}

        ########################
        #### Popular Active ####
        ########################
        Example (333000 samples, 150 components, 1 epochs, learning_rate=0.05)  20% test data
        =================================
        {'auc_train': 0.63388383, 'auc_test': 0.5569484,
        'precision_train@10': 0.7255412, 'precision_test@10': 0.17099567,
        'recall_train@10': 0.006322884137545113, 'recall_test@10': 0.006053869700910709}

        Example (333000 samples, 50 components, 1 epochs, learning_rate=0.05)  40% test data
        =================================
        {'auc_train': 0.6001097, 'auc_test': 0.56429684,
         'precision_train@10': 0.56060606, 'precision_test@10': 0.33030304,
         'recall_train@10': 0.006517918240037026, 'recall_test@10': 0.005792534657980192}

        Example (333000 samples, 50 components, 20 epochs, learning_rate=0.05)  40% test data
        =================================
        {'auc_train': 0.6077434, 'auc_test': 0.5688331,
         'precision_train@10': 0.5874459, 'precision_test@10': 0.32424247,
         'recall_train@10': 0.0068082500065638684, 'recall_test@10': 0.005756504594433489}

        Example (333000 samples, 50 components, 1 epochs, learning_rate=0.05)  40% test data with normalization
        =================================
        {'auc_train': 0.60080063, 'auc_test': 0.56425303,
         'precision_train@10': 0.56926405, 'precision_test@10': 0.33679655,
         'recall_train@10': 0.006628036812872702, 'recall_test@10': 0.005913302996971047}
         """
        ## Build Matrix Factorization between Customer and Movie
        data = self._filter_data

        dataset = Dataset()
        dataset.fit(data['Cust_Id'].unique(),
                    data['Movie_Id'].unique(),
                    item_features=self.get_combination)
        (interactions, weights) = dataset.build_interactions([
            (x['Cust_Id'], x['Movie_Id'], x['Rating'])
            for index, x in data.iterrows()
        ])

        train, test = random_train_test_split(
            interactions,
            test_percentage=0.4,
            random_state=np.random.RandomState(7))
        print("Finished creating interactions matrix!")

        ## Build movie features
        movies_id, tfidf_data = self.get_tfidf_matrix
        features_lists = [list(x) for x in tfidf_data.values]
        movies_features = dataset.build_item_features(
            data=self.get_movies_tuple(features_lists, movies_id, tfidf_data),
            normalize=True)
        print("Finished building movie features!")

        ## Build model
        model = LightFM(no_components=no_components,
                        learning_rate=learning_rate,
                        loss='warp',
                        k=15)
        model.fit(train,
                  epochs=epochs,
                  item_features=movies_features,
                  num_threads=4)
        print("Finished building LightFM model!")

        with open('hybrid_model_popular_active.pickle', 'wb') as fle:
            pickle.dump(model, fle, protocol=pickle.HIGHEST_PROTOCOL)
        print("Finished saving LightFM model!")

        return {
            "auc_train":
            auc_score(model, train, item_features=movies_features).mean(),
            "auc_test":
            auc_score(model, test, item_features=movies_features).mean(),
            "precision_train@10":
            precision_at_k(model, train, item_features=movies_features,
                           k=10).mean(),
            "precision_test@10":
            precision_at_k(model, test, item_features=movies_features,
                           k=10).mean(),
            "recall_train@10":
            recall_at_k(model, train, item_features=movies_features,
                        k=10).mean(),
            "recall_test@10":
            recall_at_k(model, test, item_features=movies_features,
                        k=10).mean()
        }
Example #33
def train_model(
               df, user_id_col='user_id', item_id_col='business_id',
               item_name_col='name_business', evaluate=True):
    """ Train the model using collaborative filtering.
    Args:
        df: the input dataframe.
        user_id_col: user id column.
        item_id_col: item id column.
        item_name_col: item name column.
        evaluate: if evaluate the model performance.
    Returns:
        model_full: the trained model.
        df_interactions: dataframe with user-item interactions.
        user_dict: user dictionary containing user_id as key and
            interaction_index as value.
        item_dict: item dictionary containing item_id as key and
            item_name as value.
        user_feature_map: the feature map of users
        business_feature_map: the feature map of items
    """
    if evaluate:
        print('Evaluating model...')
        evaluate_model(df, user_id_col='user_id', item_id_col='business_id')
    print('Training model...')

    # build recommendations for known users and known businesses
    # with collaborative filtering method
    ds_full = Dataset()
    # we call fit to supply userid, item id and user/item features
    user_cols = ['user_id', 'average_stars']
    categories = [c for c in df.columns if c[0].isupper()]
    item_cols = ['business_id', 'state']

    for i in df.columns[10:]:
        item_cols.append(str(i))

    user_features = user_cols[1:]
    item_features = item_cols[2:]

    ds_full.fit(
        df[user_id_col].unique(),  # all the users
        df[item_id_col].unique(),  # all the items
        user_features=user_features,  # additional user features
        item_features=item_features
         )

    df_users = df.drop_duplicates(user_id_col)
    # df_users = df[df.duplicated(user_id_col) == False]
    users_features = []
    for i in range(len(df_users)):
        users_features.append(get_users_features_tuple(df_users.values[i]))
    users_features = ds_full.build_user_features(
        users_features, normalize=False)

    items = df.drop_duplicates(item_id_col)
    # items = df[df.duplicated(item_id_col) == False]
    items_features = []
    for i in range(len(items)):
        items_features.append(get_items_features_tuple(
            items.values[i], categories))
    items_features = ds_full.build_item_features(
        items_features, normalize=False)

    (interactions, weights) = ds_full.build_interactions(
        [(x[0], x[1], x[2]) for x in df.values])
    # model
    model_full = LightFM(
        no_components=100, learning_rate=0.05, loss='warp', max_sampled=50)
    model_full.fit(
        interactions, user_features=users_features,
        item_features=items_features, sample_weight=weights,
        epochs=10, num_threads=10)
    # mapping
    user_id_map, user_feature_map, business_id_map, business_feature_map = \
        ds_full.mapping()

    # data preparation
    df_interactions = pd.DataFrame(weights.todense())
    df_interactions.index = list(user_id_map.keys())
    df_interactions.columns = list(business_id_map.keys())
    user_dict = user_id_map
    item_dict = df.set_index(item_id_col)[item_name_col].to_dict()
    return model_full, df_interactions, user_dict, \
        item_dict, user_feature_map, business_feature_map
Example #34
def test_exceptions():

    users, items = 10, 100

    dataset = Dataset()
    dataset.fit(range(users), range(items))

    with pytest.raises(ValueError):
        dataset.build_interactions([(users + 1, 0)])

    with pytest.raises(ValueError):
        dataset.build_interactions([(0, items + 1)])

    dataset.fit_partial([users + 1], [items + 1])
    dataset.build_interactions([(users + 1, 0)])
    dataset.build_interactions([(0, items + 1)])