def peuimportelenom():
    """Flask view: read the selected artist names from the posted form,
    inject a synthetic user into the ratings matrix, retrain a LightFM
    model and render the top-10 suggested artists.

    Relies on module-level globals: ``request``, ``ap`` (artist dataframe),
    ``vecteur`` (new-user rating vector), ``ratings`` and ``artist_names``.
    """
    noms = request.form.getlist("dblst_artists")
    sugg = []
    # Give the synthetic user a median play count for each selected artist.
    for el in noms:
        artiste = ap[ap.name == el]
        # artistID is 1-based while matrix columns are 0-based.
        lind = list(artiste.artistID)[0] - 1
        vecteur[lind] = artiste.playCountScaled.median()
    # Append the synthetic user as the last row of the rating matrix.
    X = np.vstack((ratings, vecteur))
    n_users, n_items = X.shape
    Xcsr = csr_matrix(X)
    Xcoo = Xcsr.tocoo()
    data = Dataset()
    data.fit(np.arange(n_users), np.arange(n_items))
    interactions, weights = data.build_interactions(zip(Xcoo.row, Xcoo.col, Xcoo.data))
    train, test = random_train_test_split(interactions)
    model = LightFM(learning_rate=0.05, loss='warp')
    model.fit(train, epochs=10, num_threads=2)
    # FIX: LightFM.predict expects (user_ids, item_ids).  The original call
    # passed the raw rating vector as item ids and scored user 0; score every
    # item for the newly appended user (the last row) instead.
    scores = model.predict(n_users - 1, np.arange(n_items))
    top_items = ap["name"].unique()[np.argsort(-scores)]
    sugg = top_items[:10]
    return render_template("page.html", artist_names=artist_names, noms=noms, sugg=sugg)
def build_id_mappings(self, hybrid=False) -> Dataset:
    """Build internal index mappings for the user-item interactions.

    Fits a LightFM ``Dataset`` on the ``user_id`` and ``tag_id`` columns of
    ``self.df``, mapping raw ids to the contiguous indices used internally
    by the model.  When ``hybrid`` is True the ``tag_sector`` column is
    additionally registered as the set of item features.

    Args:
        hybrid (bool): if True, register ``tag_sector`` values as item
            features so a hybrid (content + collaborative) model can be
            trained; otherwise no item features are supplied.

    Returns:
        lightfm.data.Dataset: the fitted dataset, ready to build
        interaction and feature matrices.
    """
    dataset = Dataset()
    dataset.fit(
        (x for x in self.df['user_id']),
        (x for x in self.df['tag_id']),
        item_features=(x for x in self.df['tag_sector']) if hybrid else None)
    return dataset
def create_dataset(users, movies):
    """Fit a LightFM Dataset on the given users and movies.

    Both arguments are sequences of dicts carrying an ``"id"`` key; movie
    features come from ``create_movie_features_set``.
    """
    ds = Dataset()
    user_ids = [user["id"] for user in users]
    movie_ids = [movie["id"] for movie in movies]
    ds.fit(
        users=user_ids,
        items=movie_ids,
        item_features=create_movie_features_set(movies),
    )
    return ds
def prepareData(df, tags):
    """Build LightFM interaction, item-feature and user-feature matrices
    from a click-log dataframe and an item tag source.

    Only rows whose actionCategory is "WebNei clicked" are used; an
    interaction is a (userName, actionName) pair.
    """
    df = df[df.actionCategory == "WebNei clicked"]
    # One entry per (user, action) pair; only the index is used below.
    actionByUsers = df.groupby(["userName", "actionName"]).size()
    # Keep one row per user that appears in the interaction index.
    uniqueUsers = df[df.userName.isin(
        actionByUsers.index.get_level_values(
            0).unique().values)].drop_duplicates('userName')
    uniqueUsers['user_features'] = uniqueUsers[[
        'title', 'team', 'organization', 'department'
    ]].values.tolist()
    dataset = Dataset()
    dataset.fit((list(actionByUsers.index.get_level_values(0))),
                (list(actionByUsers.index.get_level_values(1))))
    rowM, colM = prepareJson(tags)
    rowU, colU = prepareUserFeatures(uniqueUsers)
    dataset.fit_partial(items=rowM,
                        item_features=colM,
                        users=rowU,
                        user_features=colU)
    (interactions, weights) = dataset.build_interactions(
        zip(list(actionByUsers.index.get_level_values(0)),
            list(actionByUsers.index.get_level_values(1))))
    # NOTE(review): zip(rowM, [colM]) pairs only the FIRST item with the whole
    # feature list (the second operand has length 1), so every other item in
    # rowM is silently dropped.  If each item should carry its own features
    # this is likely a bug — confirm prepareJson's return contract.  The same
    # applies to zip(rowU, [colU]) below.
    item_features = dataset.build_item_features(zip(rowM, [colM]))
    user_features = dataset.build_user_features(zip(rowU, [colU]))
    return interactions, item_features, user_features
def test_fitting_no_identity():
    """With identity features disabled, the feature matrices must be empty."""
    n_users, n_items = 10, 100

    ds = Dataset(user_identity_features=False, item_identity_features=False)
    ds.fit(range(n_users), range(n_items))

    assert (n_users, n_items) == ds.interactions_shape()
    assert (n_users, 0) == ds.user_features_shape()
    assert (n_items, 0) == ds.item_features_shape()

    empty_interactions, _ = ds.build_interactions([])
    assert empty_interactions.shape == (n_users, n_items)
    assert 0 == ds.build_user_features([], normalize=False).getnnz()
    assert 0 == ds.build_item_features([], normalize=False).getnnz()
def create_recommender():
    """Build and pickle a collaborative-filtering LightFM model from the
    ``eye_video_vote`` DynamoDB table.

    Up-votes are mapped to a random rating of 4-5 and down-votes to 1-2;
    the trained model is written to ``model_cf.pickle``.
    """
    # Obtain the interaction table from DynamoDB (JSON items).
    dynamodb = boto3.resource('dynamodb')
    table = dynamodb.Table('eye_video_vote')
    response = table.scan()
    raw_data = response['Items']

    # Transform the JSON items into user-item-rating interaction rows.
    frames = []
    for i in raw_data:
        if any('upVote' in s for s in list(i.keys())):
            df1 = {k: i[k] for k in ('upVote', 'videoId')}
            df1['videoId'] = {df1['videoId']}
            df1 = pd.DataFrame.from_dict(df1, orient='index').T
            df1['rating'] = randint(4, 5)
            # FIX: pd.np was removed from pandas — use numpy directly; the
            # fillna(method='ffill') form is deprecated in favour of .ffill().
            df1.fillna(value=np.nan, inplace=True)
            df1 = df1.ffill()
            df1.rename(columns={'upVote': 'userId'}, inplace=True)
            frames.append(df1)
        if any('downVote' in s for s in list(i.keys())):
            df2 = {k: i[k] for k in ('downVote', 'videoId')}
            df2['videoId'] = {df2['videoId']}
            df2 = pd.DataFrame.from_dict(df2, orient='index').T
            df2['rating'] = randint(1, 2)
            df2.fillna(value=np.nan, inplace=True)
            df2 = df2.ffill()
            df2.rename(columns={'downVote': 'userId'}, inplace=True)
            frames.append(df2)

    # FIX: DataFrame.append was removed in pandas 2.0 — concatenate once
    # (the empty template keeps the column layout when there are no votes).
    final_df = pd.concat(
        [pd.DataFrame(columns=['userId', 'videoId', 'rating'])] + frames)

    # Rename the columns to the names expected downstream.
    final_df.rename(columns={'userId': 'UserID',
                             'videoId': 'MovieID',
                             'rating': 'rating'}, inplace=True)

    # Generate the appropriate LightFM dataset.
    dataset = Dataset()
    dataset.fit(users=(row['UserID'] for index, row in final_df.iterrows()),
                items=(row['MovieID'] for index, row in final_df.iterrows()))
    (interactions, weights) = dataset.build_interactions(
        (row['UserID'], row['MovieID'], row['rating'])
        for index, row in final_df.iterrows())

    # Collaborative-filtering model.
    model_cf = LightFM(no_components=20, loss='warp')
    model_cf.fit(interactions,
                 user_features=None,
                 item_features=None,
                 sample_weight=None,
                 epochs=20,
                 num_threads=4)

    with open('model_cf.pickle', 'wb') as fle:
        pickle.dump(model_cf, fle, protocol=pickle.HIGHEST_PROTOCOL)
    return
def test_fitting():
    """A default Dataset attaches an identity feature to every user/item."""
    n_users, n_items = 10, 100

    ds = Dataset()
    ds.fit(range(n_users), range(n_items))

    assert ds.interactions_shape() == (n_users, n_items)
    # Identity features: one feature per user and one per item.
    assert ds.user_features_shape() == (n_users, n_users)
    assert ds.item_features_shape() == (n_items, n_items)

    empty_interactions, _ = ds.build_interactions([])
    assert empty_interactions.shape == (n_users, n_items)
    assert ds.build_user_features([]).getnnz() == n_users
    assert ds.build_item_features([]).getnnz() == n_items
def test_build_features():
    """Feature matrices can be built from lists or weight dicts."""
    n_users, n_items = 10, 100
    user_tags = ["user:{}".format(i) for i in range(n_users)]
    item_tags = ["item:{}".format(i) for i in range(n_items)]

    ds = Dataset(user_identity_features=False, item_identity_features=False)
    ds.fit(range(n_users), range(n_items), user_tags, item_tags)

    # List input: every user/item carries every feature once.
    uf = ds.build_user_features([(uid, user_tags) for uid in range(n_users)])
    assert uf.getnnz() == n_users ** 2
    itf = ds.build_item_features([(iid, item_tags) for iid in range(n_items)])
    assert itf.getnnz() == n_items ** 2

    # Dict input: explicit weights are kept verbatim when normalize=False.
    user_weights = {"user:{}".format(i): float(i) for i in range(n_users)}
    uf = ds.build_user_features(
        [(uid, user_weights) for uid in range(n_users)], normalize=False)
    assert np.all(uf.todense() == np.array([list(range(n_users))] * n_users))

    item_weights = {"item:{}".format(i): float(i) for i in range(n_items)}
    itf = ds.build_item_features(
        [(iid, item_weights) for iid in range(n_items)], normalize=False)
    assert np.all(itf.todense() == np.array([list(range(n_items))] * n_items))

    # With normalization every feature row sums to one.
    itf = ds.build_item_features([(iid, item_weights) for iid in range(n_items)])
    assert np.all(itf.sum(1) == 1.0)
def predict_artist_list(artist_select):
    """Recommend ten artists for a brand-new user who likes ``artist_select``.

    Appends a synthetic user (pseudo-rating 1 for each selected artist) to
    the user-artist rating matrix, retrains a WARP LightFM model and returns
    the ten highest-scored artist names for that user.

    Relies on module-level globals ``ap`` (play-count dataframe) and
    ``artists`` (artist name/id dataframe).
    """
    # Build a user-artist rating matrix.
    ratings_df = ap.pivot(index='userID', columns='artistID', values='playCountScaled')
    ratings = ratings_df.fillna(0).values
    artist_names = ap.sort_values("artistID")["name"].unique()
    # NOTE(review): 17632 is the hard-coded artist-column count — it must
    # equal ratings_df.shape[1]; verify against the dataset.
    add_user = [0] * 17632
    # Flag every selected artist with a pseudo-rating of 1.
    for item in artist_select:
        for index in artists.index[artists["name"] == item]:
            add_user[index] = 1
    new_ratings_df = np.vstack((ratings_df, add_user))
    ratings_df = pd.DataFrame(new_ratings_df)
    new_userID = (ratings_df.shape[0] - 1)
    ratings = ratings_df.fillna(0).values
    # Build a sparse matrix.
    X = csr_matrix(ratings)
    n_users, n_items = ratings_df.shape
    user_ids = ratings_df.index.values
    artist_names = ap.sort_values("artistID")["name"].unique()
    # Build data references + train/test split.
    Xcoo = X.tocoo()
    data = Dataset()
    data.fit(np.arange(n_users), np.arange(n_items))
    interactions, weights = data.build_interactions(zip(Xcoo.row, Xcoo.col, Xcoo.data))
    train, test = random_train_test_split(interactions)
    model = LightFM(learning_rate=0.05, loss='warp')
    model.fit(train, epochs=10, num_threads=2)
    # FIX: score the newly appended user (last row, new_userID), not user 0 —
    # new_userID was computed above but never used by the original code.
    scores = model.predict(new_userID, np.arange(n_items))
    top_items = artist_names[np.argsort(-scores)]
    return top_items[0:10]
def init_lightfm_dataset(unique_elements, user_features=None, movie_features=None):
    """Create and fit a LightFM Dataset.

    ``unique_elements`` is a pair (unique user ids, unique movie ids); when
    feature arguments are given, the feature-name list is taken from the
    first element's feature collection.
    """
    users, movies = unique_elements[0], unique_elements[1]

    if user_features is not None:
        user_features = list(user_features[0][1])
    if movie_features is not None:
        movie_features = list(movie_features[0][1])

    lightfm_dataset = Dataset()
    lightfm_dataset.fit(users=users,
                        items=movies,
                        user_features=user_features,
                        item_features=movie_features)
    return lightfm_dataset
def fit(self):
    """Fit the LightFM Dataset from DataPrep data.

    Stores the fitted Dataset on ``self.dataset`` and returns the
    interaction matrix, its weights and the item-feature matrix.
    """
    books = DataPrep.get_book_list()
    book_features_list = DataPrep.get_feature_list()
    users = DataPrep.get_user_list()

    self.dataset = Dataset()
    self.dataset.fit(users=users,
                     items=books,
                     item_features=book_features_list)

    ratings = DataPrep.get_rating_list()
    interactions, weights = self.dataset.build_interactions(ratings)

    raw_features = DataPrep.create_features()
    books_features = self.dataset.build_item_features(raw_features)
    return interactions, weights, books_features
def test_build_features():
    """Feature matrices built from lists, dicts, and with normalization."""
    users, items = 10, 100

    def utag(x):
        return "user:{}".format(x)

    def itag(x):
        return "item:{}".format(x)

    all_utags = [utag(x) for x in range(users)]
    all_itags = [itag(x) for x in range(items)]

    dataset = Dataset(user_identity_features=False, item_identity_features=False)
    dataset.fit(range(users), range(items), all_utags, all_itags)

    # List input: each id gets the full tag list.
    assert dataset.build_user_features(
        [(u, all_utags) for u in range(users)]).getnnz() == users ** 2
    assert dataset.build_item_features(
        [(i, all_itags) for i in range(items)]).getnnz() == items ** 2

    # Dict input: feature weights are kept verbatim when normalize=False.
    dense_users = dataset.build_user_features(
        [(u, {utag(x): float(x) for x in range(users)}) for u in range(users)],
        normalize=False).todense()
    assert np.all(dense_users == np.array([list(range(users))] * users))

    dense_items = dataset.build_item_features(
        [(i, {itag(x): float(x) for x in range(items)}) for i in range(items)],
        normalize=False).todense()
    assert np.all(dense_items == np.array([list(range(items))] * items))

    # Normalized rows must sum to exactly one.
    normalized = dataset.build_item_features(
        [(i, {itag(x): float(x) for x in range(items)}) for i in range(items)])
    assert np.all(normalized.sum(1) == 1.0)
def create_dataset(df):
    """Fit a LightFM Dataset on df's index (users) and columns (items) and
    build the interaction matrix from every observed (user, item) pair.
    """
    dataset = Dataset()
    user_names = list(df.index)
    items = df.columns.values
    dataset.fit((user for user in user_names),
                (item for item in items))

    # df.stack() drops the empty cells, so its index holds exactly the
    # (user, item) pairs that carry a value in df.
    observed_pairs = list(df.stack().index)
    interactions, weights = dataset.build_interactions(
        pair for pair in observed_pairs)
    return dataset, interactions, weights
def obtener_matrices(self):
    """Build the matrices needed to create the LightFM models.

    Fetches the ratings/users/items dataframes through ``Entrada``, fits a
    LightFM ``Dataset``, builds the interaction, item-feature and
    user-feature matrices, pickles the feature matrices and finally splits
    the interactions into pickled train/test sets (module-level globals).
    Only used by the text interface.
    """
    global train, test, modelo, item_features, user_features

    # Fetch the dataframes.
    Entrada.obtener_datos()
    ratings_df = Entrada.ratings_df
    users_df = Entrada.users_df
    items_df = Entrada.items_df

    # Turn the dataframes into matrices the models can consume.  Column 0 is
    # assumed to hold the id and column 1 the single feature value.
    dataset = Dataset()
    dataset.fit(users_df[users_df.columns.values[0]],
                items_df[items_df.columns.values[0]],
                user_features=users_df[users_df.columns.values[1]],
                item_features=items_df[items_df.columns.values[1]])

    # Collaborative or hybrid models (options 1 and 2) keep the explicit user
    # ratings as the interaction weight; otherwise interactions are binary.
    if self.opcion_modelo == 1 or self.opcion_modelo == 2:
        (interacciones, pesos) = dataset.build_interactions(
            (row[ratings_df.columns.values[0]],
             row[ratings_df.columns.values[1]],
             row[ratings_df.columns.values[2]])
            for index, row in ratings_df.iterrows())
    else:
        (interacciones, pesos) = dataset.build_interactions(
            (row[ratings_df.columns.values[0]],
             row[ratings_df.columns.values[1]])
            for index, row in ratings_df.iterrows())

    # Build the feature matrices and persist them.
    item_features = dataset.build_item_features(
        (row[items_df.columns.values[0]], [row[items_df.columns.values[1]]])
        for index, row in items_df.iterrows())
    user_features = dataset.build_user_features(
        (row[users_df.columns.values[0]], [row[users_df.columns.values[1]]])
        for index, row in users_df.iterrows())
    print("Guarda la matriz de item features")
    guardar_datos_pickle(item_features, 'la matriz de item features')
    print("Guarda la matriz de user features")
    # NOTE: the 'feautures' typo below is a runtime string and is preserved.
    guardar_datos_pickle(user_features, 'la matriz de user feautures')

    # Split the interactions into train/test sets and persist both.
    train, test = random_train_test_split(interacciones, test_percentage=0.2)
    print("Guarda la matriz de entrenamiento")
    guardar_datos_pickle(train, 'la matriz de entrenamiento')
    print("Guarda la matriz de test")
    guardar_datos_pickle(test, 'la matriz de test')
def interactions(df):
    """Build train/test interaction matrices plus user and item feature
    matrices (genres for items, occupation for users) from df.
    """
    genres_per_movie = [genre.split("|") for genre in df["genre"]]
    unique_genres = sorted(set(itertools.chain.from_iterable(genres_per_movie)))
    unique_occupations = sorted(set(df["occupation"]))

    dataset = Dataset()
    dataset.fit(
        df["userID"],
        df["itemID"],
        item_features=unique_genres,
        user_features=unique_occupations,
    )

    # Each movie carries its genre list; each user carries one occupation.
    item_features = dataset.build_item_features(zip(df.itemID, genres_per_movie))
    user_features = dataset.build_user_features(
        (user, [occupation])
        for user, occupation in zip(df.userID, df["occupation"]))

    interactions_matrix, _ = dataset.build_interactions(df.iloc[:, 0:3].values)

    train_interactions, test_interactions = cross_validation.random_train_test_split(
        interactions_matrix,
        test_percentage=TEST_PERCENTAGE,
        random_state=np.random.RandomState(SEEDNO),
    )
    return train_interactions, test_interactions, item_features, user_features
def lightfm_trainer(train: np.ndarray, loss: str, n_components: int, lam: float) -> LightFM:
    """Train a LightFM model on implicit feedback.

    Args:
        train: array of (user, item, label) rows; only rows whose third
            column equals 1 (positive feedback) become interactions.
        loss: LightFM loss function name (e.g. 'warp', 'bpr').
        n_components: latent dimensionality.
        lam: L2 penalty applied to both user and item embeddings.

    Returns:
        The fitted LightFM model.  (FIX: the original annotation said
        ``-> None`` even though the model is returned.)
    """
    model = LightFM(
        loss=loss,
        user_alpha=lam,
        item_alpha=lam,
        no_components=n_components,
        learning_rate=0.001,
        random_state=12345,
    )
    dataset = Dataset()
    dataset.fit(train[:, 0], train[:, 1])
    # Only positive examples (label == 1) enter the interaction matrix.
    (interactions, weights) = dataset.build_interactions(
        ((x[0], x[1], 1) for x in train[train[:, 2] == 1]))
    model.fit(interactions, epochs=100)
    return model
def main():
    """Incrementally fit a LightFM Dataset over the click files of every
    competition stage up to ``current_stage``, logging id-space growth.

    NOTE(review): the source formatting was mangled, so the loop extent is
    reconstructed — fit_partial and the logging are assumed to run once per
    stage; confirm against the original project.
    """
    current_stage = 6
    model = LightFM(no_components=30)
    dataset = Dataset()
    for c in range(0, current_stage + 1):
        click_train = pd.read_csv(
            train_path + "/underexpose_train_click-{}.csv".format(c),
            header=None,
            names=["user_id", "item_id", "time"],
        )
        # NOTE(review): click_test is loaded but never used below — confirm
        # whether the test clicks were meant to be fitted as well.
        click_test = pd.read_csv(
            test_path + "/underexpose_test_click-{}.csv".format(c),
            header=None,
            names=["user_id", "item_id", "time"],
        )
        # Grow the user/item id mappings with this stage's training clicks.
        dataset.fit_partial(click_train["user_id"], click_train["item_id"])
        num_users, num_items = dataset.interactions_shape()
        log('Num users: {}, num_items {}.'.format(num_users, num_items))
class DataFit:
    """Wraps a LightFM Dataset built from DataPrep's book/user/rating data."""

    def __init__(self):
        # Populated by fit(); holds the fitted lightfm Dataset.
        self.dataset = None

    def fit(self):
        """Fit the Dataset and return (interactions, weights, item features)."""
        book_list = DataPrep.get_book_list()
        book_feature_list = DataPrep.get_feature_list()
        user_list = DataPrep.get_user_list()
        self.dataset = Dataset()
        self.dataset.fit(users=user_list,
                         items=book_list,
                         item_features=book_feature_list)
        rating_list = DataPrep.get_rating_list()
        interactions, weights = self.dataset.build_interactions(rating_list)
        book_features = DataPrep.create_features()
        books_features = self.dataset.build_item_features(book_features)
        return interactions, weights, books_features

    def create_new_interactions(self, checkpoint):
        """Build interaction matrices from ratings recorded after checkpoint.

        Requires fit() to have been called first (uses self.dataset).
        """
        rating_list = DataPrep.get_rating_list_from_checkpoint(checkpoint)
        interactions, weights = self.dataset.build_interactions(rating_list)
        return interactions, weights

    def get_user_mapping(self):
        """Return the raw-user-id -> internal-index mapping."""
        user_id_map, user_feature_map, item_id_map, item_feature_map = self.dataset.mapping(
        )
        return user_id_map

    def get_book_mapping(self):
        """Return the raw-book-id -> internal-index mapping."""
        user_id_map, user_feature_map, item_id_map, item_feature_map = self.dataset.mapping(
        )
        return item_id_map

    @staticmethod
    def fit_evaluate(test_percentage=0.1):
        """Shuffle the ratings, split them into train/test and build the
        interaction matrices of both splits under a shared id mapping.
        """
        book_list = DataPrep.get_book_list()
        book_feature_list = DataPrep.get_feature_list()
        user_list = DataPrep.get_user_list()
        dataset = Dataset()
        dataset.fit(users=user_list,
                    items=book_list,
                    item_features=book_feature_list)
        rating_list = DataPrep.get_rating_list()
        random.shuffle(rating_list)
        rating_list_test = rating_list[:int(test_percentage * len(rating_list))]
        rating_list_train = rating_list[int(test_percentage * len(rating_list)):]
        interactions_train, weights_train = dataset.build_interactions(
            rating_list_train)
        interactions_test, weights_test = dataset.build_interactions(
            rating_list_test)
        return interactions_train, weights_train, interactions_test, weights_test
def create_dataset(df, item_features, list_item_features):
    """Fit a LightFM Dataset with item features and build the interaction
    and item-feature matrices.

    Args:
        df (pandas.DataFrame): users on the index, items on the columns;
            non-empty cells mark observed interactions.
        item_features: (item, features) data accepted by
            Dataset.build_item_features.
        list_item_features: flat collection of all distinct feature names.
    """
    dataset = Dataset(item_identity_features=True)
    users = list(df.index)
    items = df.columns.values
    dataset.fit((user for user in users),
                (item for item in items),
                item_features=(feature for feature in list_item_features))

    # The stacked index contains exactly the non-empty (user, item) pairs.
    pairs = list(df.stack().index)
    interactions, weights = dataset.build_interactions(pair for pair in pairs)

    item_feature_matrix = dataset.build_item_features(item_features)
    return dataset, interactions, weights, item_feature_matrix
def fetch_data():
    """Load users, movies and 5-star ratings from the SQLite database, train
    a WARP LightFM model, and return it with its supporting artefacts.

    Returns:
        (model, weights matrix, user id mapping, movie id mapping,
         train interactions, test interactions)
    """
    # Create a SQL connection to our SQLite database.
    con = sqlite3.connect("db.sqlite3")
    cur = con.cursor()

    data = []
    users = []
    movies = []
    for row in cur.execute('SELECT id FROM RecoFramework_userinfo;'):
        users.append(row[0])
    for row in cur.execute('SELECT movieId FROM RecoFramework_movies;'):
        movies.append(row[0])
    # Only keep maximal (5-star) ratings as positive interactions.
    for row in cur.execute(
            'SELECT userId, movieId, rating FROM RecoFramework_ratings WHERE rating = 5;'
    ):
        data.append(row)

    dataset = Dataset()
    dataset.fit(users, movies)
    interactions, ratings = dataset.build_interactions(data)
    # Be sure to close the connection.
    con.close()

    train, test = random_train_test_split(interactions)
    model = LightFM(loss='warp')
    model.fit(train, epochs=30, num_threads=2)

    # FIX: use the public mapping() API instead of reaching into the private
    # _user_id_mapping/_item_id_mapping attributes.
    user_dict, _, movie_dict, _ = dataset.mapping()
    return model, ratings, user_dict, movie_dict, train, test
def train_model(df, user_id_col='user_id', item_id_col='business_id',
                item_name_col='name_business', evaluate=True):
    """Train the model using collaborative filtering.

    Args:
        df: the input dataframe.  NOTE(review): build_interactions reads
            df.values positionally (columns 0..2), so the first three
            columns must be user id, item id, rating — confirm with callers.
        user_id_col: user id column.
        item_id_col: item id column.
        item_name_col: item name column.
        evaluate: if evaluate the model performance.

    Returns:
        model_full: the trained model.
        df_interactions: dataframe with user-item interaction weights.
        user_dict: user dictionary containing user_id as key and
            interaction_index as value.
        item_dict: item dictionary containing item_id as key and
            item_name as value.
    """
    if evaluate:
        print('Evaluating model...')
        evaluate_model(df, user_id_col='user_id', item_id_col='business_id')

    print('Training model...')
    # Build recommendations for known users and known businesses with a pure
    # collaborative-filtering Dataset (no side features).
    ds_full = Dataset()
    # We call fit to supply userid, item id and user/item features.
    ds_full.fit(
        df[user_id_col].unique(),  # all the users
        df[item_id_col].unique(),  # all the items
    )
    (interactions, weights) = ds_full.build_interactions([(x[0], x[1], x[2])
                                                          for x in df.values])
    # model
    model_full = LightFM(no_components=100,
                         learning_rate=0.05,
                         loss='warp',
                         max_sampled=50)
    model_full.fit(interactions,
                   sample_weight=weights,
                   epochs=10,
                   num_threads=10)

    # Mapping between raw ids and internal indices.
    user_id_map, _, business_id_map, _ = ds_full.mapping()

    # Data preparation: dense weight matrix re-indexed by the raw ids.
    df_interactions = pd.DataFrame(weights.todense())
    df_interactions.index = list(user_id_map.keys())
    df_interactions.columns = list(business_id_map.keys())

    user_dict = user_id_map
    item_dict = df.set_index(item_id_col)[item_name_col].to_dict()
    return model_full, df_interactions, user_dict, item_dict
def __init__(self, dataset: Dataset) -> None:
    """Cache the dataset's id mappings in both directions.

    userid -> row: raw user_id to internal user index.
    itemid -> col: raw recipe_id to internal item index.
    The inverted dictionaries allow translating model output back to ids.
    """
    user_map, _, item_map, _ = dataset.mapping()
    self.userid2row = user_map
    self.itemid2col = item_map
    # Invert the dictionaries to map internal indices back to raw ids.
    self.row2userid = dict(
        zip(self.userid2row.values(), self.userid2row.keys()))
    self.col2itemid = {col: item_id for item_id, col in self.itemid2col.items()}
def build_lightfm_dataset(self) -> None:
    """Builds final datasets for user-variant and variant-variant recommendations.

    Fits a LightFM dataset on (user_id, product_id) interactions, registers
    each non-id item column's values as item features, and stores the
    resulting interaction, weight and item-feature matrices on ``self``.
    """
    logging.info("Creating LightFM matrices...")
    lightfm_dataset = LFMDataset()
    ratings_list = self.interaction_list
    logging.info('#'*60)
    # Register all user and product ids appearing in the interactions.
    lightfm_dataset.fit_partial(
        (rating['user_id'] for rating in ratings_list),
        (rating['product_id'] for rating in ratings_list)
    )
    item_feature_names = self.item_df.columns
    logging.info(f'Logging item_feature_names - with product_id: \n{item_feature_names}')
    # Every item column except the id itself becomes a feature namespace.
    item_feature_names = item_feature_names[~item_feature_names.isin(['product_id'])]
    logging.info(f'Logging item_feature_names - without product_id: \n{item_feature_names}')
    for item_feature_name in item_feature_names:
        lightfm_dataset.fit_partial(
            items=(item['product_id'] for item in self.item_list),
            item_features=((item[item_feature_name] for item in self.item_list)),
        )
    # Pair each product with its raw feature values.
    # NOTE(review): the hard-coded keys below assume item_feature_names is
    # exactly (product_name, aisle, department); keep them in sync with
    # self.item_df's columns.
    item_features_data = []
    for item in self.item_list:
        item_features_data.append(
            (
                item['product_id'],
                [
                    item['product_name'],
                    item['aisle'],
                    item['department']
                ],
            )
        )
    logging.info(f'Logging item_features_data @build_lightfm_dataset: \n{item_features_data}')
    self.item_features = lightfm_dataset.build_item_features(item_features_data)
    self.interactions, self.weights = lightfm_dataset.build_interactions(
        ((rating['user_id'], rating['product_id']) for rating in ratings_list)
    )
    self.n_users, self.n_items = self.interactions.shape
    logging.info(f'Logging self.interactions @build_lightfm_dataset: \n{self.interactions}')
    logging.info(f'Logging self.weights @build_lightfm_dataset: \n{self.weights}')
    logging.info(
        f'The shape of self.interactions {self.interactions.shape} '
        f'and self.weights {self.weights.shape} represent the user-item matrix.')
def evaluate_model(df, user_id_col='user_id', item_id_col='business_id', stratify=None):
    """Model evaluation.

    Splits df 80/20, trains a WARP model on the training interactions and
    reports train/test AUC scores.

    Args:
        df: the input dataframe.  NOTE(review): build_interactions reads the
            rows positionally, so the first three columns must be user id,
            item id, rating — confirm with callers.
        user_id_col: user id column.
        item_id_col: item id column.
        stratify: if use stratification.

    Returns:
        train_auc: training set auc score.
        test_auc: testing set auc score.
    """
    # model evaluation
    # create test and train datasets
    print('model evaluation')
    train, test = train_test_split(df, test_size=0.2, stratify=stratify)

    ds = Dataset()
    # Fit on the FULL dataframe so train and test share one id mapping.
    ds.fit(
        df[user_id_col].unique(),  # all the users
        df[item_id_col].unique(),  # all the items
    )
    # plugging in the interactions
    (train_interactions, train_weights) = ds.build_interactions([
        (x[0], x[1], x[2]) for x in train.values
    ])
    (test_interactions, _) = ds.build_interactions([(x[0], x[1], x[2])
                                                    for x in test.values])
    # model
    model = LightFM(no_components=100,
                    learning_rate=0.05,
                    loss='warp',
                    max_sampled=50)
    model.fit(train_interactions,
              sample_weight=train_weights,
              epochs=10,
              num_threads=10)

    # auc-roc
    train_auc = auc_score(model, train_interactions, num_threads=20).mean()
    print('Training set AUC: %s' % train_auc)
    test_auc = auc_score(model, test_interactions, num_threads=20).mean()
    print('Testing set AUC: %s' % test_auc)
    # FIX: the docstring promises these scores but the original function fell
    # off the end and implicitly returned None.
    return train_auc, test_auc
def create_datasets(cluster_id):
    """Build the LightFM dataset, interactions, weights and feature matrices
    for one cluster from its Elasticsearch events.

    Returns:
        (dataset, interactions, weights, final_item_features,
         final_user_features)
    """
    events_list = get_events_from_es(cluster_id)
    dataframe_interactions, dataframe_users_features, dataframe_item_features, user_tuple, item_tuple = create_interactions_and_features(events_list, cluster_id)
    # Debug output goes to stderr so it does not pollute stdout consumers.
    print(dataframe_interactions, cluster_id, file=sys.stderr)
    print(dataframe_users_features, cluster_id, file=sys.stderr)
    print(dataframe_item_features, cluster_id, file=sys.stderr)
    # Flatten the feature dataframes into the flat lists Dataset.fit expects.
    user_features = format_users_features(dataframe_users_features)
    item_features = format_items_features(dataframe_item_features)
    dataset = Dataset()
    dataset.fit(
        dataframe_interactions['user'].unique(),  # all the users
        dataframe_interactions['item'].unique(),  # all the items
        user_features = user_features,
        item_features = item_features
    )
    # NOTE(review): assumes the first three columns of dataframe_interactions
    # are (user, item, weight) — confirm in create_interactions_and_features.
    (interactions, weights) = dataset.build_interactions([(x[0], x[1], x[2])
                                                          for x in dataframe_interactions.values
                                                          ])
    final_user_features = dataset.build_user_features(user_tuple, normalize= False)
    final_item_features = dataset.build_item_features(item_tuple, normalize= False)
    return dataset, interactions, weights, final_item_features, final_user_features
def predict(user_id: int) -> str:
    """Return product recommendations for ``user_id`` from the pickled model.

    Loads the pickled LightFM model and the training CSV, rebuilds the
    Dataset with product features (product_code, country_code, cost_bin),
    and delegates scoring to ``sample_recommendation``.

    Returns None when the model file or the data file is missing.
    NOTE(review): the ``-> str`` annotation looks wrong — the function
    returns a List[str] (or None); confirm against sample_recommendation.
    """
    model_file = Path(BASE_DIR).joinpath(MODEL_FILE_NAME)
    data_file = Path(BASE_DIR).joinpath(DATA_FILE_NAME)
    if not model_file.exists():
        return None
    if not data_file.exists():
        return None
    # NOTE(review): pickle.load on an artifact file — safe only if the file
    # is trusted (never unpickle untrusted input).
    model: LightFM = pickle.load(open(model_file, "rb"))
    data: pd.DataFrame = pd.read_csv(data_file)
    dataset = Dataset()
    dataset.fit((cac for cac in data.cac.unique()),
                (product for product in data.product_code.unique()))
    features = ['product_code', 'country_code', 'cost_bin']
    # Register each feature column's distinct values as item features
    # (product_code itself is included, acting as an identity feature).
    for product_feature in features:
        dataset.fit_partial(
            users=(cac for cac in data.cac.unique()),
            items=(product for product in data.product_code.unique()),
            item_features=(feature for feature in data[product_feature].unique()))
    # Each product row carries its non-id feature values.
    item_features = dataset.build_item_features(((getattr(row, 'product_code'),
                                                  [getattr(row, product_feature) for product_feature in features if product_feature != 'product_code']) \
        for row in data[features].itertuples()))
    predicted_products: List[str] = sample_recommendation(
        model=model,
        dataset=dataset,
        raw_data=data,
        item_features=item_features,
        user_ids=user_id)
    return predicted_products
def fit_evaluate(test_percentage=0.1):
    """Shuffle the ratings, split them into train/test and build interaction
    matrices for each split under a shared id mapping.

    Args:
        test_percentage: fraction of shuffled ratings held out for testing.
    """
    books = DataPrep.get_book_list()
    book_features_list = DataPrep.get_feature_list()
    users = DataPrep.get_user_list()

    dataset = Dataset()
    dataset.fit(users=users,
                items=books,
                item_features=book_features_list)

    ratings = DataPrep.get_rating_list()
    random.shuffle(ratings)
    cut = int(test_percentage * len(ratings))
    test_ratings, train_ratings = ratings[:cut], ratings[cut:]

    interactions_train, weights_train = dataset.build_interactions(train_ratings)
    interactions_test, weights_test = dataset.build_interactions(test_ratings)
    return interactions_train, weights_train, interactions_test, weights_test
def train_model():
    """Fit a BPR LightFM model on the paan ratings and flavour features."""
    dataset = Dataset()
    dataset.fit((rating['User_ID'] for rating in get_ratings()),
                (rating['Item_ID'] for rating in get_ratings()))

    # Register the 25 flavour features one by one.
    for feature in paan_features[:25]:
        add_item_features(dataset, feature)

    interactions, weights = dataset.build_interactions(
        (rating['User_ID'], rating['Item_ID']) for rating in get_ratings())

    # Column order mirrors the order in which the features were registered.
    flavour_columns = [
        'Banaras', 'Calcutta', 'Maghai', 'Sada', 'Meetha', 'Chocolate',
        'Dry Fruit', 'Mango', 'Strawberry', 'Pineapple', 'Kaju', 'Jelly',
        'Rose', 'Shahi', 'Kesar', 'Vanilla', 'Masala', 'Khatta', 'Orange',
        'White', 'Silver', 'RaatRani', 'Nutella', 'Special', 'Gold'
    ]
    item_features = dataset.build_item_features(
        (item['Item_ID'], [item[column] for column in flavour_columns])
        for item in get_item_features())

    model = LightFM(loss='bpr')
    model.fit(interactions, item_features=item_features)

    labels = np.array([item['Item_ID'] for item in get_item_features()])
    print("Model Trained Successfully.....")
    return model, interactions, labels, item_features
# Script setup: load reviews plus user/business statistics, then initialize
# the LightFM dataset and feature matrices.
user_stats_file = sys.argv[3]
business_stats_file = sys.argv[4]
print('[ %04ds ] Program started' % (time.time() - start_time))
# Load the serialized inputs (paths come from the command line).
training_set: List[Review] = Review.load_from_file(training_set_file)
user_stats: Dict[str, User] = User.load_from_file(user_stats_file)
business_stats: Dict[str, Business] = Business.load_from_file(
    business_stats_file)
print('[ %04ds ] Files loaded' % (time.time() - start_time))
# Users carry a single placeholder feature; businesses get real features.
all_user_features = ['NO_FEAT']
all_business_features = Business.collect_business_features(business_stats)
dataset = Dataset()
dataset.fit(User.extract_user_ids(user_stats),
            Business.extract_business_ids(business_stats),
            user_features=all_user_features,
            item_features=all_business_features)
# NOTE(review): the trailing positional True is assumed to be `normalize` —
# confirm against lightfm's Dataset.build_user_features signature.
user_features = dataset.build_user_features(
    User.build_user_features(user_stats, User.extract_user_ids(user_stats)),
    True)
business_features = dataset.build_item_features(
    Business.build_business_features(
        business_stats, Business.extract_business_ids(business_stats)),
    True)
print('[ %04ds ] Dataset initialized' % (time.time() - start_time))
def run(self, epochs: int = 1, no_components: int = 50, learning_rate: float = 0.05) -> Dict[str, float]: """ build interaction matrix -> build movie features -> build model Example (5000 samples, 50 components, 5 epochs, learning_rate=0.05) ================================= {'auc_train': 0.66268414, 'auc_test': 0.67257625, 'precision_train@10': 0.035984848, 'precision_test@10': 0.014193548, 'recall_train@10': 0.06827082513973247, 'recall_test@10': 0.0646373101211811} ########################### #### Random Stratified #### ########################### Example (2 million samples, 50 components, 1 epochs, learning_rate=0.05) ================================= {'auc_train': 0.5171841, 'auc_test': 0.51610065, 'precision_train@10': 0.018248174, 'precision_test@10': 0.0040145987, 'recall_train@10': 0.0008001067196610589, 'recall_t0.018248174est@10': 0.0007001527280332769} ######################## #### Popular Active #### ######################## Example (333000 samples, 150 components, 1 epochs, learning_rate=0.05) 20% test data ================================= {'auc_train': 0.63388383, 'auc_test': 0.5569484, 'precision_train@10': 0.7255412, 'precision_test@10': 0.17099567, 'recall_train@10': 0.006322884137545113, 'recall_test@10': 0.006053869700910709} Example (333000 samples, 50 components, 1 epochs, learning_rate=0.05) 40% test data ================================= {'auc_train': 0.6001097, 'auc_test': 0.56429684, 'precision_train@10': 0.56060606, 'precision_test@10': 0.33030304, 'recall_train@10': 0.006517918240037026, 'recall_test@10': 0.005792534657980192} Example (333000 samples, 50 components, 20 epochs, learning_rate=0.05) 40% test data ================================= {'auc_train': 0.6077434, 'auc_test': 0.5688331, 'precision_train@10': 0.5874459, 'precision_test@10': 0.32424247, 'recall_train@10': 0.0068082500065638684, 'recall_test@10': 0.005756504594433489} Example (333000 samples, 50 components, 1 epochs, learning_rate=0.05) 40% test data with normalization 
================================= {'auc_train': 0.60080063, 'auc_test': 0.56425303, 'precision_train@10': 0.56926405, 'precision_test@10': 0.33679655, 'recall_train@10': 0.006628036812872702, 'recall_test@10': 0.005913302996971047} """ ## Build Matrix Factorization between Customer and Movie data = self._filter_data dataset = Dataset() dataset.fit(data['Cust_Id'].unique(), data['Movie_Id'].unique(), item_features=self.get_combination) (interactions, weights) = dataset.build_interactions([ (x['Cust_Id'], x['Movie_Id'], x['Rating']) for index, x in data.iterrows() ]) train, test = random_train_test_split( interactions, test_percentage=0.4, random_state=np.random.RandomState(7)) print("Finished creating interactions matrix!") ## Build movie features movies_id, tfidf_data = self.get_tfidf_matrix features_lists = [list(x) for x in tfidf_data.values] movies_features = dataset.build_item_features( data=self.get_movies_tuple(features_lists, movies_id, tfidf_data), normalize=True) print("Finished building movie features!") ## Build model model = LightFM(no_components=no_components, learning_rate=learning_rate, loss='warp', k=15) model.fit(train, epochs=epochs, item_features=movies_features, num_threads=4) print("Finished building LightFM model!") with open('hybrid_model_popular_active.pickle', 'wb') as fle: pickle.dump(model, fle, protocol=pickle.HIGHEST_PROTOCOL) print("Finished saving LightFM model!") return { "auc_train": auc_score(model, train, item_features=movies_features).mean(), "auc_test": auc_score(model, test, item_features=movies_features).mean(), "precision_train@10": precision_at_k(model, train, item_features=movies_features, k=10).mean(), "precision_test@10": precision_at_k(model, test, item_features=movies_features, k=10).mean(), "recall_train@10": recall_at_k(model, train, item_features=movies_features, k=10).mean(), "recall_test@10": recall_at_k(model, test, item_features=movies_features, k=10).mean() }
def train_model(
        df,
        user_id_col='user_id',
        item_id_col='business_id',
        item_name_col='name_business',
        evaluate=True):
    """Train a LightFM model (collaborative filtering with side features).

    Args:
        df: Input DataFrame. Columns from index 10 onward are treated as
            extra item features; columns whose name starts with an uppercase
            letter are treated as category features.
        user_id_col: User id column name.
        item_id_col: Item id column name.
        item_name_col: Item name column name.
        evaluate: If True, run evaluate_model() on df before training.

    Returns:
        model_full: The trained LightFM model.
        df_interactions: DataFrame of user-item interaction weights.
        user_dict: Mapping of user_id -> internal interaction index.
        item_dict: Mapping of item_id -> item name.
        user_feature_map: Internal feature map of users.
        business_feature_map: Internal feature map of items.
    """
    if evaluate:
        print('Evaluating model...')
        evaluate_model(df, user_id_col='user_id', item_id_col='business_id')
    print('Training model...')

    # Build recommendations for known users and known businesses
    # with a collaborative filtering method.
    ds_full = Dataset()

    # We call fit to supply user ids, item ids and user/item feature names.
    user_cols = ['user_id', 'average_stars']
    categories = [c for c in df.columns if c[0].isupper()]
    item_cols = ['business_id', 'state'] + [str(c) for c in df.columns[10:]]
    user_features = user_cols[1:]
    item_features = item_cols[2:]
    ds_full.fit(
        df[user_id_col].unique(),      # all the users
        df[item_id_col].unique(),      # all the items
        user_features=user_features,   # additional user features
        item_features=item_features
    )

    # One feature tuple per distinct user.
    df_users = df.drop_duplicates(user_id_col)
    users_features = ds_full.build_user_features(
        [get_users_features_tuple(row) for row in df_users.values],
        normalize=False)

    # One feature tuple per distinct item.
    items = df.drop_duplicates(item_id_col)
    items_features = ds_full.build_item_features(
        [get_items_features_tuple(row, categories) for row in items.values],
        normalize=False)

    # NOTE(review): assumes the first three df columns are
    # (user_id, item_id, rating) in that order — confirm against the caller.
    (interactions, weights) = ds_full.build_interactions(
        [(x[0], x[1], x[2]) for x in df.values])

    # Model.
    model_full = LightFM(
        no_components=100,
        learning_rate=0.05,
        loss='warp',
        max_sampled=50)
    model_full.fit(
        interactions,
        user_features=users_features,
        item_features=items_features,
        sample_weight=weights,
        epochs=10,
        num_threads=10)

    # Internal id/feature index mappings.
    user_id_map, user_feature_map, business_id_map, business_feature_map = \
        ds_full.mapping()

    # Dense interaction-weight matrix re-indexed by the original ids.
    df_interactions = pd.DataFrame(weights.todense())
    df_interactions.index = list(user_id_map.keys())
    df_interactions.columns = list(business_id_map.keys())
    user_dict = user_id_map
    item_dict = df.set_index(item_id_col)[item_name_col].to_dict()

    return model_full, df_interactions, user_dict, \
        item_dict, user_feature_map, business_feature_map
def test_exceptions():
    """Out-of-range ids raise ValueError until fit_partial registers them."""
    n_users, n_items = 10, 100
    ds = Dataset()
    ds.fit(range(n_users), range(n_items))

    # Ids beyond the fitted ranges must be rejected.
    unknown_pairs = [(n_users + 1, 0), (0, n_items + 1)]
    for pair in unknown_pairs:
        with pytest.raises(ValueError):
            ds.build_interactions([pair])

    # Once the new ids are registered incrementally, the same
    # interactions build without error.
    ds.fit_partial([n_users + 1], [n_items + 1])
    for pair in unknown_pairs:
        ds.build_interactions([pair])