class DataFit:
    def __init__(self):
        self.dataset = None

    def fit(self):
        book_list = DataPrep.get_book_list()
        book_feature_list = DataPrep.get_feature_list()
        user_list = DataPrep.get_user_list()
        self.dataset = Dataset()
        self.dataset.fit(users=user_list,
                         items=book_list,
                         item_features=book_feature_list)
        rating_list = DataPrep.get_rating_list()
        interactions, weights = self.dataset.build_interactions(rating_list)
        book_features = DataPrep.create_features()
        books_features = self.dataset.build_item_features(book_features)
        return interactions, weights, books_features

    def create_new_interactions(self, checkpoint):
        rating_list = DataPrep.get_rating_list_from_checkpoint(checkpoint)
        interactions, weights = self.dataset.build_interactions(rating_list)
        return interactions, weights

    def get_user_mapping(self):
        user_id_map, user_feature_map, item_id_map, item_feature_map = self.dataset.mapping()
        return user_id_map

    def get_book_mapping(self):
        user_id_map, user_feature_map, item_id_map, item_feature_map = self.dataset.mapping()
        return item_id_map

    @staticmethod
    def fit_evaluate(test_percentage=0.1):
        book_list = DataPrep.get_book_list()
        book_feature_list = DataPrep.get_feature_list()
        user_list = DataPrep.get_user_list()
        dataset = Dataset()
        dataset.fit(users=user_list,
                    items=book_list,
                    item_features=book_feature_list)
        rating_list = DataPrep.get_rating_list()
        random.shuffle(rating_list)
        rating_list_test = rating_list[:int(test_percentage * len(rating_list))]
        rating_list_train = rating_list[int(test_percentage * len(rating_list)):]
        interactions_train, weights_train = dataset.build_interactions(rating_list_train)
        interactions_test, weights_test = dataset.build_interactions(rating_list_test)
        return interactions_train, weights_train, interactions_test, weights_test
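# Usage sketch (not part of the original class): a minimal driver for DataFit,
# assuming the DataPrep helpers it references are importable. The model and
# hyperparameters below are illustrative assumptions, not taken from the class.
from lightfm import LightFM

data_fit = DataFit()
interactions, weights, books_features = data_fit.fit()

model = LightFM(loss='warp')  # loss choice is an assumption
model.fit(interactions,
          item_features=books_features,
          sample_weight=weights,
          epochs=10,
          num_threads=4)

# mapping() translates external ids to the internal indices LightFM scores over
user_id_map = data_fit.get_user_mapping()
book_id_map = data_fit.get_book_mapping()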
def obtener_matrices(self):
    """
    obtener_matrices method.

    Builds the matrices needed to create the LightFM models.
    This method is only used in the text interface.
    """
    global train, test, modelo, item_features, user_features

    # Fetch the dataframes
    Entrada.obtener_datos()
    ratings_df = Entrada.ratings_df
    users_df = Entrada.users_df
    items_df = Entrada.items_df

    # Turn the dataframes into matrices the models can consume
    dataset = Dataset()
    dataset.fit(users_df[users_df.columns.values[0]],
                items_df[items_df.columns.values[0]],
                user_features=users_df[users_df.columns.values[1]],
                item_features=items_df[items_df.columns.values[1]])

    # Collaborative and hybrid models take the users' ratings into account
    if self.opcion_modelo == 1 or self.opcion_modelo == 2:
        (interacciones, pesos) = dataset.build_interactions(
            (row[ratings_df.columns.values[0]],
             row[ratings_df.columns.values[1]],
             row[ratings_df.columns.values[2]])
            for index, row in ratings_df.iterrows())
    else:
        (interacciones, pesos) = dataset.build_interactions(
            (row[ratings_df.columns.values[0]],
             row[ratings_df.columns.values[1]])
            for index, row in ratings_df.iterrows())

    # Build the feature matrices and save them
    item_features = dataset.build_item_features(
        (row[items_df.columns.values[0]], [row[items_df.columns.values[1]]])
        for index, row in items_df.iterrows())
    user_features = dataset.build_user_features(
        (row[users_df.columns.values[0]], [row[users_df.columns.values[1]]])
        for index, row in users_df.iterrows())
    print("Saving the item features matrix")
    guardar_datos_pickle(item_features, 'the item features matrix')
    print("Saving the user features matrix")
    guardar_datos_pickle(user_features, 'the user features matrix')

    # Split the interactions into train and test sets and save them
    train, test = random_train_test_split(interacciones, test_percentage=0.2)
    print("Saving the training matrix")
    guardar_datos_pickle(train, 'the training matrix')
    print("Saving the test matrix")
    guardar_datos_pickle(test, 'the test matrix')
def evaluate_model(df, user_id_col='user_id', item_id_col='business_id',
                   stratify=None):
    """Evaluate a collaborative-filtering model on a random 80/20 split.

    Args:
        df: the input dataframe; its first three columns must be
            (user id, item id, rating).
        user_id_col: user id column.
        item_id_col: item id column.
        stratify: column to stratify the split on, or None.

    Prints the training and testing AUC scores; no return value.
    """
    # create test and train datasets
    print('model evaluation')
    train, test = train_test_split(df, test_size=0.2, stratify=stratify)

    ds = Dataset()
    # we call fit to supply the full universe of user and item ids
    ds.fit(
        df[user_id_col].unique(),  # all the users
        df[item_id_col].unique(),  # all the items
    )

    # plug in the interactions
    (train_interactions, train_weights) = ds.build_interactions(
        [(x[0], x[1], x[2]) for x in train.values])
    (test_interactions, _) = ds.build_interactions(
        [(x[0], x[1], x[2]) for x in test.values])

    # model
    model = LightFM(no_components=100, learning_rate=0.05, loss='warp',
                    max_sampled=50)
    model.fit(train_interactions, sample_weight=train_weights, epochs=10,
              num_threads=10)

    # auc-roc
    train_auc = auc_score(model, train_interactions, num_threads=20).mean()
    print('Training set AUC: %s' % train_auc)
    test_auc = auc_score(model, test_interactions, num_threads=20).mean()
    print('Testing set AUC: %s' % test_auc)
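# Usage sketch (not part of the original function): a toy invocation of
# evaluate_model on synthetic data, assuming the imports the function relies on
# (sklearn's train_test_split, LightFM's Dataset/LightFM/auc_score) are in
# scope. The function indexes train.values positionally, so the first three
# columns must be (user id, item id, rating).
import pandas as pd

toy_df = pd.DataFrame({
    'user_id': ['u1', 'u1', 'u2', 'u2', 'u3'],
    'business_id': ['b1', 'b2', 'b2', 'b3', 'b1'],
    'rating': [5, 3, 4, 2, 5],
})
evaluate_model(toy_df)  # prints the training and testing AUC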
def interactions(self):
    # If interactions have not been supplied, process the file provided in source
    # N.B. This property also sets weights, which is probably not a best practice
    if self._interactions is None:
        if self._category == 'ratings_matrix':
            rm_df = pd.read_csv(self.path)
            ids = rm_df['sub']
            rm_df = rm_df.set_index(keys='sub')
            if 'Unnamed: 0' in rm_df.columns:
                rm_df.drop('Unnamed: 0', axis=1, inplace=True)
            dataset = Dataset()
            dataset.fit(list(ids), list(rm_df.columns))
            self.mapping = dataset.mapping()
            interactions = []
            for item in rm_df.columns.tolist():
                users = rm_df.index[rm_df[item] >= 1].tolist()
                counts = rm_df[item][rm_df[item] >= 1]
                interactions.extend(
                    zip(users, itertools.repeat(item, len(users)), counts))
            (self._interactions,
             self._weights) = dataset.build_interactions(interactions)
        else:
            int_df = pd.read_csv(self.path)
            if 'Unnamed: 0' in int_df.columns:
                int_df.drop('Unnamed: 0', axis=1, inplace=True)
            int_df = int_df.groupby(['subscriber_id', 'ddi_block_id']).size() \
                .reset_index().rename(columns={0: 'count'})
            dataset = Dataset()
            ids = int_df['subscriber_id'].unique()
            items = int_df['ddi_block_id'].unique()
            dataset.fit(list(ids), list(items))
            self.mapping = dataset.mapping()
            if self._use_weights:
                interactions = zip(int_df['subscriber_id'],
                                   int_df['ddi_block_id'],
                                   int_df['count'])
            else:
                interactions = zip(int_df['subscriber_id'],
                                   int_df['ddi_block_id'])
            (self._interactions,
             self._weights) = dataset.build_interactions(interactions)
    # always return the (possibly just-built) interactions matrix
    return self._interactions
def peuimportelenom():
    noms = request.form.getlist("dblst_artists")
    sugg = []
    for el in noms:
        artiste = ap[ap.name == el]
        lind = list(artiste.artistID)[0] - 1
        vecteur[lind] = artiste.playCountScaled.median()

    # Build the matrix with the new user's vector appended as the last row
    X = np.vstack((ratings, vecteur))

    # Code imported from the Jupyter notebook
    n_users, n_items = X.shape
    Xcsr = csr_matrix(X)
    Xcoo = Xcsr.tocoo()
    data = Dataset()
    data.fit(np.arange(n_users), np.arange(n_items))
    interactions, weights = data.build_interactions(
        zip(Xcoo.row, Xcoo.col, Xcoo.data))
    train, test = random_train_test_split(interactions)
    model = LightFM(learning_rate=0.05, loss='warp')
    model.fit(train, epochs=10, num_threads=2)

    # Score every item for the newly appended user (the last row of X);
    # predict() expects item indices, not the rating vector itself
    scores = model.predict(n_users - 1, np.arange(n_items))
    top_items = ap["name"].unique()[np.argsort(-scores)]
    sugg = top_items[:10]
    return render_template("page.html", artist_names=artist_names,
                           noms=noms, sugg=sugg)
def create_dataset(df, item_features, list_item_features):
    """Create the LightFM dataset from df, which stores all the data,
    including the features (tags) of each product.

    Args:
        df (pandas dataframe): user x item dataframe whose non-null entries
            mark user-item interactions.
        item_features: iterable of (item id, [feature names]) pairs.
        list_item_features: iterable of all possible item feature names.
    """
    # Create a mapping between the user and item ids from our input data
    # to indices that will be used internally by the model
    dataset = Dataset(item_identity_features=True)
    list_user_names = list(df.index)
    list_items = df.columns.values
    dataset.fit(
        (user_name for user_name in list_user_names),
        (item for item in list_items),
        item_features=(item_feature for item_feature in list_item_features))

    # Build the interaction matrix: it encodes the interactions between
    # users and items, one (user, item) pair per non-null entry in df
    list_pairs = list(df.stack().index)
    (interactions, weights) = dataset.build_interactions(
        (pair for pair in list_pairs))

    item_feature_matrix = dataset.build_item_features(item_features)
    return dataset, interactions, weights, item_feature_matrix
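# Input sketch, inferred from how create_dataset uses df rather than taken from
# the original source: df is a user x item dataframe whose non-null entries
# mark interactions. The item ids and tags below are hypothetical.
import pandas as pd

df = pd.DataFrame([[1.0, None], [None, 1.0]],
                  index=['alice', 'bob'],        # users
                  columns=['item_a', 'item_b'])  # items
item_features = [('item_a', ['tag:red']), ('item_b', ['tag:blue'])]
list_item_features = ['tag:red', 'tag:blue']

dataset, interactions, weights, item_feature_matrix = create_dataset(
    df, item_features, list_item_features)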
def interactions(df):
    movie_genre = [x.split("|") for x in df["genre"]]
    all_movie_genre = sorted(
        list(set(itertools.chain.from_iterable(movie_genre))))
    all_occupations = sorted(list(set(df["occupation"])))

    dataset = Dataset()
    dataset.fit(
        df["userID"],
        df["itemID"],
        item_features=all_movie_genre,
        user_features=all_occupations,
    )

    item_features = dataset.build_item_features(
        (x, y) for x, y in zip(df.itemID, movie_genre))
    user_features = dataset.build_user_features(
        (x, [y]) for x, y in zip(df.userID, df["occupation"]))

    (interactions, _) = dataset.build_interactions(df.iloc[:, 0:3].values)
    train_interactions, test_interactions = cross_validation.random_train_test_split(
        interactions,
        test_percentage=TEST_PERCENTAGE,
        random_state=np.random.RandomState(SEEDNO),
    )
    return train_interactions, test_interactions, item_features, user_features
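# Usage sketch (not part of the original function): a toy dataframe in the
# shape interactions() expects. The first three columns must be
# (userID, itemID, rating) because build_interactions slices df.iloc[:, 0:3].
# TEST_PERCENTAGE and SEEDNO are module-level constants the function assumes;
# the values below are stand-ins.
import pandas as pd

TEST_PERCENTAGE = 0.25  # stand-in value
SEEDNO = 42             # stand-in value

toy_df = pd.DataFrame({
    'userID': [1, 1, 2, 3],
    'itemID': [10, 11, 10, 11],
    'rating': [4.0, 3.0, 5.0, 2.0],
    'genre': ['Action|Comedy', 'Drama', 'Action|Comedy', 'Drama'],
    'occupation': ['artist', 'artist', 'doctor', 'artist'],
})
train_int, test_int, item_feats, user_feats = interactions(toy_df)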
def prepareData(df, tags):
    df = df[df.actionCategory == "WebNei clicked"]
    actionByUsers = df.groupby(["userName", "actionName"]).size()
    uniqueUsers = df[df.userName.isin(
        actionByUsers.index.get_level_values(0).unique().values
    )].drop_duplicates('userName')
    uniqueUsers['user_features'] = uniqueUsers[[
        'title', 'team', 'organization', 'department'
    ]].values.tolist()

    dataset = Dataset()
    dataset.fit(list(actionByUsers.index.get_level_values(0)),
                list(actionByUsers.index.get_level_values(1)))
    rowM, colM = prepareJson(tags)
    rowU, colU = prepareUserFeatures(uniqueUsers)
    dataset.fit_partial(items=rowM, item_features=colM,
                        users=rowU, user_features=colU)

    (interactions, weights) = dataset.build_interactions(
        zip(list(actionByUsers.index.get_level_values(0)),
            list(actionByUsers.index.get_level_values(1))))
    # note: zip(rowM, [colM]) pairs only the first item with the full feature
    # list, since [colM] has a single element; likewise for zip(rowU, [colU])
    item_features = dataset.build_item_features(zip(rowM, [colM]))
    user_features = dataset.build_user_features(zip(rowU, [colU]))
    return interactions, item_features, user_features
def train_model(df, user_id_col='user_id', item_id_col='business_id',
                item_name_col='name_business', evaluate=True):
    """Train the model using collaborative filtering.

    Args:
        df: the input dataframe.
        user_id_col: user id column.
        item_id_col: item id column.
        item_name_col: item name column.
        evaluate: whether to evaluate the model performance first.

    Returns:
        model_full: the trained model.
        df_interactions: dataframe with user-item interactions.
        user_dict: user dictionary containing user_id as key and
            interaction_index as value.
        item_dict: item dictionary containing item_id as key and
            item_name as value.
    """
    if evaluate:
        print('Evaluating model...')
        evaluate_model(df, user_id_col='user_id', item_id_col='business_id')

    print('Training model...')
    # build recommendations for known users and known businesses
    # with the collaborative filtering method
    ds_full = Dataset()
    # we call fit to supply userid, item id and user/item features
    ds_full.fit(
        df[user_id_col].unique(),  # all the users
        df[item_id_col].unique(),  # all the items
    )
    (interactions, weights) = ds_full.build_interactions(
        [(x[0], x[1], x[2]) for x in df.values])

    # model
    model_full = LightFM(no_components=100, learning_rate=0.05, loss='warp',
                         max_sampled=50)
    model_full.fit(interactions, sample_weight=weights, epochs=10,
                   num_threads=10)

    # mapping
    user_id_map, _, business_id_map, _ = ds_full.mapping()

    # data preparation
    df_interactions = pd.DataFrame(weights.todense())
    df_interactions.index = list(user_id_map.keys())
    df_interactions.columns = list(business_id_map.keys())
    user_dict = user_id_map
    item_dict = df.set_index(item_id_col)[item_name_col].to_dict()
    return model_full, df_interactions, user_dict, item_dict
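# Usage sketch (not part of the original function): scoring every known item
# for one known user with the artifacts train_model returns, given a dataframe
# df whose first three columns are (user id, item id, rating).
# 'some_user_id' is a hypothetical id that must exist in df.
import numpy as np

model, df_interactions, user_dict, item_dict = train_model(df, evaluate=False)

user_x = user_dict['some_user_id']  # internal user index
n_items = df_interactions.shape[1]
scores = model.predict(user_x, np.arange(n_items))
top_items = df_interactions.columns[np.argsort(-scores)][:10]
print([item_dict[i] for i in top_items])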
def test_fitting_no_identity():
    users, items = 10, 100

    dataset = Dataset(user_identity_features=False,
                      item_identity_features=False)
    dataset.fit(range(users), range(items))

    assert dataset.interactions_shape() == (users, items)
    assert dataset.user_features_shape() == (users, 0)
    assert dataset.item_features_shape() == (items, 0)

    assert dataset.build_interactions([])[0].shape == (users, items)
    assert dataset.build_user_features([], normalize=False).getnnz() == 0
    assert dataset.build_item_features([], normalize=False).getnnz() == 0
def create_recommender():
    # obtain the interaction table (JSON data) from DynamoDB
    dynamodb = boto3.resource('dynamodb')
    table = dynamodb.Table('eye_video_vote')
    response = table.scan()
    raw_data = response['Items']

    # transform the JSON records into user-item-rating interaction format
    final_df = pd.DataFrame(columns=['userId', 'videoId', 'rating'])
    for i in raw_data:
        if any('upVote' in s for s in list(i.keys())):
            df1 = {k: i[k] for k in ('upVote', 'videoId')}
            df1['videoId'] = {df1['videoId']}
            df1 = pd.DataFrame.from_dict(df1, orient='index').T
            df1['rating'] = randint(4, 5)
            df1 = df1.fillna(method='ffill')
            df1.rename(columns={'upVote': 'userId'}, inplace=True)
            final_df = pd.concat([final_df, df1])
        if any('downVote' in s for s in list(i.keys())):
            df2 = {k: i[k] for k in ('downVote', 'videoId')}
            df2['videoId'] = {df2['videoId']}
            df2 = pd.DataFrame.from_dict(df2, orient='index').T
            df2['rating'] = randint(1, 2)
            df2 = df2.fillna(method='ffill')
            df2.rename(columns={'downVote': 'userId'}, inplace=True)
            final_df = pd.concat([final_df, df2])

    # rename the columns
    final_df.rename(columns={'userId': 'UserID', 'videoId': 'MovieID',
                             'rating': 'rating'}, inplace=True)

    # generate the appropriate lightfm dataset
    dataset = Dataset()
    dataset.fit(users=(row['UserID'] for index, row in final_df.iterrows()),
                items=(row['MovieID'] for index, row in final_df.iterrows()))
    (interactions, weights) = dataset.build_interactions(
        (row['UserID'], row['MovieID'], row['rating'])
        for index, row in final_df.iterrows())

    # collaborative filtering model
    model_cf = LightFM(no_components=20, loss='warp')
    model_cf.fit(interactions, user_features=None, item_features=None,
                 sample_weight=None, epochs=20, num_threads=4)
    with open('model_cf.pickle', 'wb') as fle:
        pickle.dump(model_cf, fle, protocol=pickle.HIGHEST_PROTOCOL)
    return
def test_fitting():
    users, items = 10, 100

    dataset = Dataset()
    dataset.fit(range(users), range(items))

    assert dataset.interactions_shape() == (users, items)
    # identity features are on by default, so every user/item gets an
    # indicator feature of its own
    assert dataset.user_features_shape() == (users, users)
    assert dataset.item_features_shape() == (items, items)

    assert dataset.build_interactions([])[0].shape == (users, items)
    assert dataset.build_user_features([]).getnnz() == users
    assert dataset.build_item_features([]).getnnz() == items
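# Illustration (uses only the public Dataset API; not from the original tests):
# the two fitting tests above differ only in the identity-feature flags. With
# identity features on, every user keeps an indicator column of its own, and
# explicitly supplied features are appended as extra columns. The ids and
# feature name below are hypothetical.
from lightfm.data import Dataset

dataset = Dataset()  # identity features on by default
dataset.fit(['u1', 'u2'], ['i1'], user_features=['age:young'])
# 2 identity columns (one per user) + 1 explicit feature column
assert dataset.user_features_shape() == (2, 3)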
def build_lightfm_dataset(self) -> None:
    """
    Builds final datasets for user-variant and variant-variant recommendations.
    """
    logging.info("Creating LightFM matrices...")
    lightfm_dataset = LFMDataset()
    ratings_list = self.interaction_list
    logging.info('#' * 60)
    lightfm_dataset.fit_partial(
        (rating['user_id'] for rating in ratings_list),
        (rating['product_id'] for rating in ratings_list)
    )

    item_feature_names = self.item_df.columns
    logging.info(f'Logging item_feature_names - with product_id: \n{item_feature_names}')
    item_feature_names = item_feature_names[~item_feature_names.isin(['product_id'])]
    logging.info(f'Logging item_feature_names - without product_id: \n{item_feature_names}')
    for item_feature_name in item_feature_names:
        lightfm_dataset.fit_partial(
            items=(item['product_id'] for item in self.item_list),
            item_features=(item[item_feature_name] for item in self.item_list),
        )

    item_features_data = []
    for item in self.item_list:
        item_features_data.append(
            (
                item['product_id'],
                [
                    item['product_name'],
                    item['aisle'],
                    item['department']
                ],
            )
        )
    logging.info(f'Logging item_features_data @build_lightfm_dataset: \n{item_features_data}')
    self.item_features = lightfm_dataset.build_item_features(item_features_data)

    self.interactions, self.weights = lightfm_dataset.build_interactions(
        ((rating['user_id'], rating['product_id']) for rating in ratings_list)
    )
    self.n_users, self.n_items = self.interactions.shape
    logging.info(f'Logging self.interactions @build_lightfm_dataset: \n{self.interactions}')
    logging.info(f'Logging self.weights @build_lightfm_dataset: \n{self.weights}')
    logging.info(
        f'The shape of self.interactions {self.interactions.shape} '
        f'and self.weights {self.weights.shape} represent the user-item matrix.')
def predict_artist_list(artist_select):
    # Build a user-artist rating matrix
    ratings_df = ap.pivot(index='userID', columns='artistID',
                          values='playCountScaled')
    ratings = ratings_df.fillna(0).values
    artist_names = ap.sort_values("artistID")["name"].unique()

    # Append a new user row with a 1 for every selected artist
    # (17632 is the number of artist columns in the pivoted matrix)
    add_user = [0] * 17632
    new_list = []
    for item in artist_select:
        artists_idx = artists.index[artists["name"] == item]
        new_list.append(artists_idx)
    for i in new_list:
        for j in i:
            add_user[j] = 1
    new_ratings_df = np.vstack((ratings_df, add_user))
    ratings_df = pd.DataFrame(new_ratings_df)
    new_userID = ratings_df.shape[0] - 1
    ratings = ratings_df.fillna(0).values

    # Build a sparse matrix
    X = csr_matrix(ratings)
    n_users, n_items = ratings_df.shape
    user_ids = ratings_df.index.values

    # Build data references + train test split
    Xcoo = X.tocoo()
    data = Dataset()
    data.fit(np.arange(n_users), np.arange(n_items))
    interactions, weights = data.build_interactions(
        zip(Xcoo.row, Xcoo.col, Xcoo.data))
    train, test = random_train_test_split(interactions)
    model = LightFM(learning_rate=0.05, loss='warp')
    model.fit(train, epochs=10, num_threads=2)

    # Predict for the newly appended user (the last row)
    scores = model.predict(new_userID, np.arange(n_items))
    top_items = artist_names[np.argsort(-scores)]
    return top_items[0:10]
def create_dataset(df):
    # Create a mapping between the user and item ids from our input data
    # to indices that will be used internally by the model
    dataset = Dataset()
    list_user_names = list(df.index)
    list_items = df.columns.values
    dataset.fit((user_name for user_name in list_user_names),
                (item for item in list_items))

    # Build the interaction matrix: it encodes the interactions between
    # users and items, one (user, item) pair per non-null entry in df
    list_pairs = list(df.stack().index)
    (interactions, weights) = dataset.build_interactions(
        (pair for pair in list_pairs))
    return dataset, interactions, weights
def lightfm_trainer(train: np.ndarray, loss: str, n_components: int,
                    lam: float) -> LightFM:
    """Train a LightFM model on the positive rows of `train`."""
    model = LightFM(
        loss=loss,
        user_alpha=lam,
        item_alpha=lam,
        no_components=n_components,
        learning_rate=0.001,
        random_state=12345,
    )
    dataset = Dataset()
    dataset.fit(train[:, 0], train[:, 1])
    # keep only the rows labelled 1 and give each a unit weight
    (interactions, weights) = dataset.build_interactions(
        (x[0], x[1], 1) for x in train[train[:, 2] == 1])
    model.fit(interactions, epochs=100)
    return model
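# Usage sketch (not part of the original function): train is an array of
# (user, item, label) rows, and only the label == 1 rows become interactions.
# The data below is synthetic.
import numpy as np

train = np.array([
    [0, 0, 1],
    [0, 1, 0],
    [1, 1, 1],
    [1, 2, 1],
])
model = lightfm_trainer(train, loss='warp', n_components=10, lam=1e-5)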
def fit_data(self, matrix, user_features=None, item_features=None):
    """Create datasets for the .fit() method.

    Args:
        matrix: User-item interactions matrix (weighted)
        user_features: User-features pandas dataframe whose index contains
            user_ids (crd_no)
        item_features: Item-features pandas dataframe whose index contains
            good_ids (plu_id)

    Returns:
        Model with fitted (mapped) datasets
    """
    matrix.sort_index(inplace=True)
    matrix.sort_index(inplace=True, axis=1)

    dataset = Dataset()
    dataset.fit((x for x in matrix.index), (x for x in matrix.columns))

    interactions = pd.melt(
        matrix.replace(0, np.nan).reset_index(),
        id_vars='index',
        value_vars=list(matrix.columns[1:]),
        var_name='plu_id',
        value_name='rating').dropna().sort_values('index')
    interactions.columns = ['crd_no', 'plu_id', 'rating']
    self.interactions, self.weights = dataset.build_interactions(
        [tuple(x) for x in interactions.values])

    if user_features is not None:
        user_features.sort_index(inplace=True)
        # iterating a DataFrame yields its column names, which LightFM
        # treats as the set of possible user feature names
        dataset.fit_partial(users=user_features.index,
                            user_features=user_features)
        self.user_features = dataset.build_user_features(
            ((index, dict(row)) for index, row in user_features.iterrows()))
    else:
        self.user_features = None

    if item_features is not None:
        item_features.sort_index(inplace=True)
        dataset.fit_partial(items=item_features.index,
                            item_features=item_features)
        self.item_features = dataset.build_item_features(
            ((index, dict(row)) for index, row in item_features.iterrows()))
    else:
        self.item_features = None
def load_parameter():
    ratings = get_ratings()
    books = get_books()
    users = get_users()
    books_pd = convert_pd(books)

    id_users_books = StoreValue()
    for x in ratings:
        id_users_books._user_id.append(x[0])
        id_users_books._book_id.append(x[1])

    # Built following the guide at
    # https://making.lyst.com/lightfm/docs/examples/dataset.html
    dataset_explicit = Dataset()
    dataset_explicit.fit(id_users_books._user_id, id_users_books._book_id)
    num_users, num_items = dataset_explicit.interactions_shape()
    print('Num users: {}, num_items {}.'.format(num_users, num_items))

    dataset_explicit.fit_partial(items=(x[0] for x in books),
                                 item_features=(x[7] for x in books))
    dataset_explicit.fit_partial(users=(x[0] for x in users))

    # create the mapping
    # interactions: a COO matrix whose entries are (user_id, book_id) pairs
    # weights: the voting weights
    (interactions_explicit, weights_explicit) = dataset_explicit.build_interactions(
        (id_users_books._user_id[i], id_users_books._book_id[i])
        for i in range(len(ratings)))

    # item features extracted from the books, based on each book's author
    item_features = dataset_explicit.build_item_features(
        ((x[0], [x[7]]) for x in books))
    # user_features = dataset_explicit.build_user_features(((x[0], [x[1]]) for x in users))

    model_explicit_ratings = LightFM_ext(loss='warp')
    (train, test) = random_train_test_split(interactions=interactions_explicit,
                                            test_percentage=0.02)
    model_explicit_ratings.fit(train, item_features=item_features,
                               epochs=2, num_threads=4)
    return (model_explicit_ratings, dataset_explicit, interactions_explicit,
            weights_explicit, item_features, books_pd)
def fetch_data():
    # Create a SQL connection to our SQLite database
    con = sqlite3.connect("db.sqlite3")
    cur = con.cursor()

    # The result of a "cursor.execute" can be iterated over by row
    data = []
    users = []
    movies = []
    for row in cur.execute('SELECT id FROM RecoFramework_userinfo;'):
        users.append(row[0])
    for row in cur.execute('SELECT movieId FROM RecoFramework_movies;'):
        movies.append(row[0])
    for row in cur.execute(
            'SELECT userId, movieId, rating FROM RecoFramework_ratings WHERE rating = 5;'):
        data.append(row)

    dataset = Dataset()
    dataset.fit(users, movies)
    interactions, ratings = dataset.build_interactions(data)

    # Be sure to close the connection
    con.close()

    train, test = random_train_test_split(interactions)
    model = LightFM(loss='warp')
    # train the LightFM model using the fit method
    model.fit(train, epochs=30, num_threads=2)

    # these are private attributes of Dataset; dataset.mapping() exposes
    # the same dictionaries through the public API
    user_dict = dataset._user_id_mapping
    movie_dict = dataset._item_id_mapping
    return model, ratings, user_dict, movie_dict, train, test
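# Evaluation sketch (not part of the original function): measuring ranking
# quality on the split fetch_data returns. Excluding the training interactions
# from the AUC via train_interactions is a choice made here, not in the source.
from lightfm.evaluation import auc_score, precision_at_k

model, ratings, user_dict, movie_dict, train, test = fetch_data()
print('precision@10:', precision_at_k(model, test, k=10).mean())
print('AUC:', auc_score(model, test, train_interactions=train).mean())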
def create_datasets(cluster_id):
    events_list = get_events_from_es(cluster_id)
    (dataframe_interactions, dataframe_users_features, dataframe_item_features,
     user_tuple, item_tuple) = create_interactions_and_features(events_list, cluster_id)
    print(dataframe_interactions, cluster_id, file=sys.stderr)
    print(dataframe_users_features, cluster_id, file=sys.stderr)
    print(dataframe_item_features, cluster_id, file=sys.stderr)

    user_features = format_users_features(dataframe_users_features)
    item_features = format_items_features(dataframe_item_features)

    dataset = Dataset()
    dataset.fit(
        dataframe_interactions['user'].unique(),  # all the users
        dataframe_interactions['item'].unique(),  # all the items
        user_features=user_features,
        item_features=item_features
    )
    (interactions, weights) = dataset.build_interactions(
        [(x[0], x[1], x[2]) for x in dataframe_interactions.values])

    final_user_features = dataset.build_user_features(user_tuple, normalize=False)
    final_item_features = dataset.build_item_features(item_tuple, normalize=False)
    return dataset, interactions, weights, final_item_features, final_user_features
def test_exceptions():
    users, items = 10, 100

    dataset = Dataset()
    dataset.fit(range(users), range(items))

    # ids outside the fitted ranges must raise
    with pytest.raises(ValueError):
        dataset.build_interactions([(users + 1, 0)])
    with pytest.raises(ValueError):
        dataset.build_interactions([(0, items + 1)])

    # after fit_partial extends the id ranges, the same pairs build fine
    dataset.fit_partial([users + 1], [items + 1])
    dataset.build_interactions([(users + 1, 0)])
    dataset.build_interactions([(0, items + 1)])
def train_model():
    dataset = Dataset()
    dataset.fit((x['User_ID'] for x in get_ratings()),
                (x['Item_ID'] for x in get_ratings()))
    for i in range(25):
        add_item_features(dataset, paan_features[i])

    (interactions, weights) = dataset.build_interactions(
        ((x['User_ID'], x['Item_ID']) for x in get_ratings()))
    item_features = dataset.build_item_features(((x['Item_ID'], [
        x['Banaras'], x['Calcutta'], x['Maghai'], x['Sada'], x['Meetha'],
        x['Chocolate'], x['Dry Fruit'], x['Mango'], x['Strawberry'],
        x['Pineapple'], x['Kaju'], x['Jelly'], x['Rose'], x['Shahi'],
        x['Kesar'], x['Vanilla'], x['Masala'], x['Khatta'], x['Orange'],
        x['White'], x['Silver'], x['RaatRani'], x['Nutella'], x['Special'],
        x['Gold']
    ]) for x in get_item_features()))

    model = LightFM(loss='bpr')
    model.fit(interactions, item_features=item_features)
    labels = np.array([x['Item_ID'] for x in get_item_features()])
    print("Model Trained Successfully.....")
    return model, interactions, labels, item_features
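# Prediction sketch (not part of the original function): ranking all items for
# one user. The internal user index 0 is arbitrary, and labels lining up with
# the internal item order assumes get_item_features() yields items in the same
# order they were fitted.
import numpy as np

model, interactions, labels, item_features = train_model()
n_items = interactions.shape[1]
scores = model.predict(0, np.arange(n_items), item_features=item_features)
print(labels[np.argsort(-scores)][:5])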
user_features = dataset.build_user_features(
    User.build_user_features(user_stats,
                             User.extract_user_ids(user_stats)), True)
business_features = dataset.build_item_features(
    Business.build_business_features(
        business_stats, Business.extract_business_ids(business_stats)), True)
print('[ %04ds ] Dataset initialized' % (time.time() - start_time))

user_avg, user_std = Review.extract_user_average_and_std(training_set)
normalized_training_reviews = Review.normalize_by_user(training_set, user_avg)
training_interactions = Review.extract_sparse_interaction_matrix(
    normalized_training_reviews)
interaction_matrix, interaction_weight = dataset.build_interactions(
    training_interactions)
print('[ %04ds ] Interactions built' % (time.time() - start_time))

no_components = 50
loss = 'bpr'
learning_rate = 0.1
item_alpha = 1e-5
user_alpha = 1e-5
epochs = 20
model = LightFM(no_components=no_components,
                loss=loss,
                learning_rate=learning_rate,
                item_alpha=item_alpha,
                user_alpha=user_alpha)
def run(self, epochs: int = 1, no_components: int = 50,
        learning_rate: float = 0.05) -> Dict[str, float]:
    """Build interaction matrix -> build movie features -> build model.

    Example (5000 samples, 50 components, 5 epochs, learning_rate=0.05)
    =================================
    {'auc_train': 0.66268414, 'auc_test': 0.67257625,
     'precision_train@10': 0.035984848, 'precision_test@10': 0.014193548,
     'recall_train@10': 0.06827082513973247, 'recall_test@10': 0.0646373101211811}

    ###########################
    #### Random Stratified ####
    ###########################

    Example (2 million samples, 50 components, 1 epoch, learning_rate=0.05)
    =================================
    {'auc_train': 0.5171841, 'auc_test': 0.51610065,
     'precision_train@10': 0.018248174, 'precision_test@10': 0.0040145987,
     'recall_train@10': 0.0008001067196610589, 'recall_test@10': 0.0007001527280332769}

    ########################
    #### Popular Active ####
    ########################

    Example (333000 samples, 150 components, 1 epoch, learning_rate=0.05) 20% test data
    =================================
    {'auc_train': 0.63388383, 'auc_test': 0.5569484,
     'precision_train@10': 0.7255412, 'precision_test@10': 0.17099567,
     'recall_train@10': 0.006322884137545113, 'recall_test@10': 0.006053869700910709}

    Example (333000 samples, 50 components, 1 epoch, learning_rate=0.05) 40% test data
    =================================
    {'auc_train': 0.6001097, 'auc_test': 0.56429684,
     'precision_train@10': 0.56060606, 'precision_test@10': 0.33030304,
     'recall_train@10': 0.006517918240037026, 'recall_test@10': 0.005792534657980192}

    Example (333000 samples, 50 components, 20 epochs, learning_rate=0.05) 40% test data
    =================================
    {'auc_train': 0.6077434, 'auc_test': 0.5688331,
     'precision_train@10': 0.5874459, 'precision_test@10': 0.32424247,
     'recall_train@10': 0.0068082500065638684, 'recall_test@10': 0.005756504594433489}

    Example (333000 samples, 50 components, 1 epoch, learning_rate=0.05) 40% test data with normalization
    =================================
    {'auc_train': 0.60080063, 'auc_test': 0.56425303,
     'precision_train@10': 0.56926405, 'precision_test@10': 0.33679655,
     'recall_train@10': 0.006628036812872702, 'recall_test@10': 0.005913302996971047}
    """
    # Build the matrix factorization between customer and movie
    data = self._filter_data
    dataset = Dataset()
    dataset.fit(data['Cust_Id'].unique(),
                data['Movie_Id'].unique(),
                item_features=self.get_combination)
    (interactions, weights) = dataset.build_interactions([
        (x['Cust_Id'], x['Movie_Id'], x['Rating'])
        for index, x in data.iterrows()
    ])
    train, test = random_train_test_split(
        interactions, test_percentage=0.4,
        random_state=np.random.RandomState(7))
    print("Finished creating interactions matrix!")

    # Build movie features
    movies_id, tfidf_data = self.get_tfidf_matrix
    features_lists = [list(x) for x in tfidf_data.values]
    movies_features = dataset.build_item_features(
        data=self.get_movies_tuple(features_lists, movies_id, tfidf_data),
        normalize=True)
    print("Finished building movie features!")

    # Build the model
    model = LightFM(no_components=no_components,
                    learning_rate=learning_rate, loss='warp', k=15)
    model.fit(train, epochs=epochs, item_features=movies_features,
              num_threads=4)
    print("Finished building LightFM model!")
    with open('hybrid_model_popular_active.pickle', 'wb') as fle:
        pickle.dump(model, fle, protocol=pickle.HIGHEST_PROTOCOL)
    print("Finished saving LightFM model!")

    return {
        "auc_train": auc_score(model, train,
                               item_features=movies_features).mean(),
        "auc_test": auc_score(model, test,
                              item_features=movies_features).mean(),
        "precision_train@10": precision_at_k(model, train,
                                             item_features=movies_features,
                                             k=10).mean(),
        "precision_test@10": precision_at_k(model, test,
                                            item_features=movies_features,
                                            k=10).mean(),
        "recall_train@10": recall_at_k(model, train,
                                       item_features=movies_features,
                                       k=10).mean(),
        "recall_test@10": recall_at_k(model, test,
                                      item_features=movies_features,
                                      k=10).mean()
    }
def train_model(df, user_id_col='user_id', item_id_col='business_id',
                item_name_col='name_business', evaluate=True):
    """Train the model using collaborative filtering with user/item features.

    Args:
        df: the input dataframe.
        user_id_col: user id column.
        item_id_col: item id column.
        item_name_col: item name column.
        evaluate: whether to evaluate the model performance first.

    Returns:
        model_full: the trained model.
        df_interactions: dataframe with user-item interactions.
        user_dict: user dictionary containing user_id as key and
            interaction_index as value.
        item_dict: item dictionary containing item_id as key and
            item_name as value.
        user_feature_map: the feature map of users.
        business_feature_map: the feature map of items.
    """
    if evaluate:
        print('Evaluating model...')
        evaluate_model(df, user_id_col='user_id', item_id_col='business_id')

    print('Training model...')
    # build recommendations for known users and known businesses
    ds_full = Dataset()

    # we call fit to supply userid, item id and user/item features
    user_cols = ['user_id', 'average_stars']
    categories = [c for c in df.columns if c[0].isupper()]
    item_cols = ['business_id', 'state']
    for i in df.columns[10:]:
        item_cols.append(str(i))
    user_features = user_cols[1:]
    item_features = item_cols[2:]
    ds_full.fit(
        df[user_id_col].unique(),  # all the users
        df[item_id_col].unique(),  # all the items
        user_features=user_features,  # additional user features
        item_features=item_features
    )

    df_users = df.drop_duplicates(user_id_col)
    users_features = []
    for i in range(len(df_users)):
        users_features.append(get_users_features_tuple(df_users.values[i]))
    users_features = ds_full.build_user_features(users_features,
                                                 normalize=False)

    items = df.drop_duplicates(item_id_col)
    items_features = []
    for i in range(len(items)):
        items_features.append(get_items_features_tuple(items.values[i],
                                                       categories))
    items_features = ds_full.build_item_features(items_features,
                                                 normalize=False)

    (interactions, weights) = ds_full.build_interactions(
        [(x[0], x[1], x[2]) for x in df.values])

    # model
    model_full = LightFM(no_components=100, learning_rate=0.05, loss='warp',
                         max_sampled=50)
    model_full.fit(interactions,
                   user_features=users_features,
                   item_features=items_features,
                   sample_weight=weights,
                   epochs=10,
                   num_threads=10)

    # mapping
    user_id_map, user_feature_map, business_id_map, business_feature_map = \
        ds_full.mapping()

    # data preparation
    df_interactions = pd.DataFrame(weights.todense())
    df_interactions.index = list(user_id_map.keys())
    df_interactions.columns = list(business_id_map.keys())
    user_dict = user_id_map
    item_dict = df.set_index(item_id_col)[item_name_col].to_dict()
    return (model_full, df_interactions, user_dict, item_dict,
            user_feature_map, business_feature_map)
def evaluate_model(df, user_id_col='user_id', item_id_col='business_id',
                   stratify=None):
    """Model evaluation with user/item features.

    Args:
        df: the input dataframe.
        user_id_col: user id column.
        item_id_col: item id column.
        stratify: column to stratify the split on, or None.

    No return value.
    """
    # create test and train datasets
    print('model evaluation')
    train, test = train_test_split(df, test_size=0.2, stratify=stratify)

    ds = Dataset()
    # we call fit to supply userid, item id and user/item features
    user_cols = ['user_id', 'average_stars']
    categories = [c for c in df.columns if c[0].isupper()]
    item_cols = ['business_id', 'state']
    for i in df.columns[10:]:
        item_cols.append(str(i))
    user_features = user_cols[1:]
    item_features = item_cols[2:]
    ds.fit(
        df[user_id_col].unique(),  # all the users
        df[item_id_col].unique(),  # all the items
        user_features=user_features,  # additional user features
        item_features=item_features
    )

    train_users = train.drop_duplicates('user_id')
    train_user_features = []
    for i in range(len(train_users)):
        train_user_features.append(get_users_features_tuple(train_users.values[i]))
    train_user_features = ds.build_user_features(train_user_features,
                                                 normalize=False)

    test_users = test.drop_duplicates('user_id')
    test_user1_features = []
    for i in range(len(test_users)):
        test_user1_features.append(get_users_features_tuple(test_users.values[i]))
    test_user_features = ds.build_user_features(test_user1_features,
                                                normalize=False)

    train_items = train.drop_duplicates('business_id')
    train_item1_features = []
    for i in range(len(train_items)):
        train_item1_features.append(get_items_features_tuple(train_items.values[i],
                                                             categories))
    train_item_features = ds.build_item_features(train_item1_features,
                                                 normalize=False)

    test_items = test.drop_duplicates('business_id')
    test_item_features = []
    for i in range(len(test_items)):
        test_item_features.append(get_items_features_tuple(test_items.values[i],
                                                           categories))
    test_item_features = ds.build_item_features(test_item_features,
                                                normalize=False)

    # plug in the interactions and their weights
    (train_interactions, train_weights) = ds.build_interactions(
        [(x[0], x[1], x[2]) for x in train.values])
    (test_interactions, test_weights) = ds.build_interactions(
        [(x[0], x[1], x[2]) for x in test.values])

    # model
    model = LightFM(no_components=100, learning_rate=0.05, loss='warp',
                    max_sampled=50)
    model.fit(train_interactions,
              user_features=train_user_features,
              item_features=train_item_features,
              sample_weight=train_weights,
              epochs=10,
              num_threads=10)

    # auc-roc
    train_auc = auc_score(model, train_interactions,
                          user_features=train_user_features,
                          item_features=train_item_features,
                          num_threads=20).mean()
    print('Training set AUC: %s' % train_auc)
    test_auc = auc_score(model, test_interactions,
                         user_features=test_user_features,
                         item_features=test_item_features,
                         num_threads=20).mean()
    print('Testing set AUC: %s' % test_auc)
def lambda_handler(event, context):
    # Fetch data from RDS
    try:
        connection = pymysql.connect(
            host='fitbookdb.crm91a2epcbi.us-east-1.rds.amazonaws.com',
            user='******',
            passwd='postgres',
            db='fitbookdb',
            cursorclass=pymysql.cursors.DictCursor)
        print("Connection successful")
    except:
        print("Connection error")

    # Get the food dataframe
    dict_list = []
    with connection.cursor() as cur:
        cur.execute("select * from food_dataset")
        for row in cur:
            dict_list.append(row)
    food_rds_df = pd.DataFrame(dict_list)
    food_df = food_rds_df.copy()
    food_df.drop([
        'Portion_Default', 'Portion_Amount', 'Factor', 'Increment',
        'Multiplier', 'Portion_Display_Name', 'Food_Code', 'Display_Name'
    ], axis=1, inplace=True)
    print('Food Dataframe imported')

    # TODO: Perform binning of continuous food columns
    # (e.g. 'Alcohol', 'Calories', 'Saturated_Fats' into 30 bins with pd.cut)

    # Get the user dataframe
    dict_list = []
    with connection.cursor() as cur:
        cur.execute("select * from tblUserData")
        for row in cur:
            dict_list.append(row)
    user_rds_df = pd.DataFrame(dict_list)
    user_df = user_rds_df.copy()
    user_df.drop([
        'cognitoAccessToken', 'cognitoIDToken', 'cognitoRefreshToken',
        'fitbitAccessToken', 'fitbitUserID', 'userName'
    ], axis=1, inplace=True)
    print('User Dataframe imported')

    # Get the user-item dataframe
    dict_list = []
    with connection.cursor() as cur:
        cur.execute("select * from tblUserRating")
        for row in cur:
            dict_list.append(row)
    userItem_rds_df = pd.DataFrame(dict_list)
    userItem_df = userItem_rds_df.copy()
    print('UserItem Dataframe imported')

    # Make all the feature values unique by prefixing the column name
    for column_name in food_df.columns:
        if column_name != 'food_ID':
            food_df[column_name] = str(column_name) + ":" + \
                food_df[column_name].astype(str)

    # This dict is useful when creating the (id, [features]) tuples
    food_features_df = food_df.drop(['food_ID'], axis=1).copy()
    food_features_dict = food_features_df.to_dict('split')

    food_feature_values = []
    for column_name in food_features_df.columns:
        food_feature_values.extend(food_features_df[column_name].unique())

    for column_name in user_df.columns:
        if column_name != 'userID':
            user_df[column_name] = str(column_name) + ":" + \
                user_df[column_name].astype(str)
    user_features_df = user_df.drop(['userID'], axis=1).copy()
    user_features_dict = user_features_df.to_dict('split')

    user_feature_values = []
    for column_name in user_features_df.columns:
        user_feature_values.extend(user_features_df[column_name].unique())

    user_tuples = []
    food_tuples = []
    for index, row in user_df.iterrows():
        user_tuples.append((row['userID'], user_features_dict['data'][index]))
    for index, row in food_df.iterrows():
        food_tuples.append((row['food_ID'], food_features_dict['data'][index]))

    print("Creating LightFM dataset")
    dataset = Dataset()
    dataset.fit(users=(user_id for user_id in user_df['userID']),
                items=(food_id for food_id in food_df['food_ID']))
    print("Dataset Created")

    num_users, num_items = dataset.interactions_shape()
    print('Num users: {}, num_items {}.'.format(num_users, num_items))

    print("Fitting item partial features")
    dataset.fit_partial(items=(food_id for food_id in food_df['food_ID']),
                        item_features=(each_value for each_value in food_feature_values))

    print("Fitting user partial features")
    dataset.fit_partial(users=(user_id for user_id in user_df['userID']),
                        user_features=(each_value for each_value in user_feature_values))

    print("Building Interactions")
    (interactions, weights) = dataset.build_interactions(
        ((x['userID'], x['food_ID'], x['rating'])
         for y, x in userItem_df.iterrows()))

    print("Building item features")
    item_features = dataset.build_item_features(each_tuple for each_tuple in food_tuples)
    user_features = dataset.build_user_features(each_tuple for each_tuple in user_tuples)

    print("Fitting Model")
    model = LightFM(loss='warp')
    model.fit(interactions, item_features=item_features,
              user_features=user_features)
    print("Model trained!!")

    print("Pickle started!!")
    pickle.dump(model, open("/tmp/model.pkl", 'wb'), protocol=2)
    bucketName = "fitbook-lambda-packages"
    Key = "/tmp/model.pkl"
    outPutname = "model.pkl"
    print("Uploading to S3")
    s3 = boto3.client('s3')
    s3.upload_file(Key, bucketName, outPutname)
    print("Upload done")
    os.remove("/tmp/model.pkl")
    print("Pickle file deleted")
    print("Successssss!!!!!")
# build a LightFM dataset and fit it on the interaction data
dataset = Dataset()
dataset.fit((x['User-ID'] for x in get_ratings()),
            (x['ISBN'] for x in get_ratings()))

# query the dataset to check how many users and items (i.e. books) it knows
num_users, num_items = dataset.interactions_shape()
print('Num users : {}, num_items {}.'.format(num_users, num_items))

# add some item feature mappings, and create a unique feature for each author
# NOTE: more item ids are fitted than usual, to make sure our mappings are complete
# even if there are items in the features dataset that are not in the interaction set
dataset.fit_partial(items=(x['ISBN'] for x in get_book_features()),
                    item_features=(x['Book-Author'] for x in get_book_features()))

# build the interaction matrix, which is a main input to the LightFM model;
# it encodes the interactions between the users and the items
(interactions, weights) = dataset.build_interactions(
    ((x['User-ID'], x['ISBN']) for x in get_ratings()))

# an item_features matrix can also be created
item_features = dataset.build_item_features(
    ((x['ISBN'], [x['Book-Author']]) for x in get_book_features()))

# split the current dataset into a training and test dataset
train, test = random_train_test_split(interactions, test_percentage=0.01,
                                      random_state=None)

# build the model using the training dataset; notice the use of item_features
# as well, making this a hybrid model
model = LightFM(loss='warp', item_alpha=ITEM_ALPHA,
                no_components=NUM_COMPONENTS)
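# Hedged continuation (not from the original snippet, which defines the model
# but stops before training): the epoch count and thread count below are
# assumptions, not values taken from the source.
from lightfm.evaluation import auc_score

model.fit(train, item_features=item_features, epochs=10, num_threads=4)
print('test AUC: %.3f' % auc_score(model, test, item_features=item_features,
                                   num_threads=4).mean())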
def calc(request):
    try:
        stores = Store.objects.all()
        reviews = Review.objects.all()
        stores = pd.DataFrame(list(stores.values(
            'id', 'store_id', 'store_name', 'category', 'address',
            'latitude', 'longitude', 'average_rating')))
        reviews = pd.DataFrame(list(reviews.values(
            'id', 'storeid', 'userid', 'score', 'reg_time')))

        reviews_source = [(reviews['userid'][i], reviews['storeid'][i])
                          for i in range(reviews.shape[0])]
        item_feature_source = [(stores['store_id'][i],
                                [stores['category'][i], stores['address'][i],
                                 stores['latitude'][i], stores['longitude'][i],
                                 stores['average_rating'][i]])
                               for i in range(stores.shape[0])]

        dataset = Dataset()
        dataset.fit(users=reviews['userid'].unique(),
                    items=reviews['storeid'].unique(),
                    item_features=stores[stores.columns[1:]].values.flatten())
        interactions, weights = dataset.build_interactions(reviews_source)
        item_features = dataset.build_item_features(item_feature_source)

        # Split into train and test data
        train, test = random_train_test_split(interactions, test_percentage=0.1)
        train, test = train.tocsr().tocoo(), test.tocsr().tocoo()
        train_weights = train.multiply(weights).tocoo()

        # Define the search space
        trials = Trials()
        space = [hp.choice('no_components', range(10, 50, 10)),
                 hp.uniform('learning_rate', 0.01, 0.05)]

        # Define the objective function
        def objective(params):
            no_components, learning_rate = params
            global model
            model = LightFM(no_components=no_components,
                            learning_schedule='adagrad',
                            loss='warp',
                            learning_rate=learning_rate,
                            random_state=0)
            model.fit(interactions=train,
                      item_features=item_features,
                      sample_weight=train_weights,
                      epochs=3,
                      verbose=False)
            test_precision = precision_at_k(model, test, k=5,
                                            item_features=item_features).mean()
            print("no_comp: {}, lrn_rate: {:.5f}, precision: {:.5f}".format(
                no_components, learning_rate, test_precision))
            output = -test_precision
            if np.abs(output + 1) < 0.01 or output < -1.0:
                output = 0.0
            return output

        # max_evals sets how many evaluations the search runs
        best_params = fmin(fn=objective, space=space, algo=tpe.suggest,
                           max_evals=10, trials=trials)

        # Save the item features
        with open('./saved_models/item_features.pickle', 'wb') as fle:
            pickle.dump(item_features, fle, protocol=pickle.HIGHEST_PROTOCOL)
        # Save the model
        with open('./saved_models/model.pickle', 'wb') as fle:
            pickle.dump(model, fle, protocol=pickle.HIGHEST_PROTOCOL)

        # Save the item embeddings
        item_biases, item_embeddings = model.get_item_representations(
            features=item_features)
        with open('./saved_models/item_embeddings.pickle', 'wb') as fle:
            pickle.dump(item_embeddings, fle, protocol=pickle.HIGHEST_PROTOCOL)

        return Response({'result': True})
    except Exception:
        return Response({'result': False})