class DataFit:

    def __init__(self):
        self.dataset = None

    def fit(self):
        book_list = DataPrep.get_book_list()
        book_feature_list = DataPrep.get_feature_list()
        user_list = DataPrep.get_user_list()
        self.dataset = Dataset()
        self.dataset.fit(users=user_list,
                         items=book_list,
                         item_features=book_feature_list)
        rating_list = DataPrep.get_rating_list()
        interactions, weights = self.dataset.build_interactions(rating_list)
        book_features = DataPrep.create_features()
        books_features = self.dataset.build_item_features(book_features)
        return interactions, weights, books_features

    def create_new_interactions(self, checkpoint):
        rating_list = DataPrep.get_rating_list_from_checkpoint(checkpoint)
        interactions, weights = self.dataset.build_interactions(rating_list)
        return interactions, weights

    def get_user_mapping(self):
        user_id_map, user_feature_map, item_id_map, item_feature_map = self.dataset.mapping()
        return user_id_map

    def get_book_mapping(self):
        user_id_map, user_feature_map, item_id_map, item_feature_map = self.dataset.mapping()
        return item_id_map

    @staticmethod
    def fit_evaluate(test_percentage=0.1):
        book_list = DataPrep.get_book_list()
        book_feature_list = DataPrep.get_feature_list()
        user_list = DataPrep.get_user_list()
        dataset = Dataset()
        dataset.fit(users=user_list,
                    items=book_list,
                    item_features=book_feature_list)
        rating_list = DataPrep.get_rating_list()
        random.shuffle(rating_list)
        rating_list_test = rating_list[:int(test_percentage * len(rating_list))]
        rating_list_train = rating_list[int(test_percentage * len(rating_list)):]
        interactions_train, weights_train = dataset.build_interactions(rating_list_train)
        interactions_test, weights_test = dataset.build_interactions(rating_list_test)
        return interactions_train, weights_train, interactions_test, weights_test
def interactions(self):
    # If interactions have not been supplied, process the file provided in source.
    # N.B. This property also sets weights, which is probably not a best practice.
    if self._interactions is None:
        if self._category == 'ratings_matrix':
            rm_df = pd.read_csv(self.path)
            ids = rm_df['sub']
            rm_df = rm_df.set_index(keys='sub')
            if 'Unnamed: 0' in rm_df.columns:
                rm_df.drop('Unnamed: 0', axis=1, inplace=True)
            dataset = Dataset()
            dataset.fit(list(ids), list(rm_df.columns))
            self.mapping = dataset.mapping()
            interactions = []
            for item in rm_df.columns.tolist():
                users = rm_df.index[rm_df[item] >= 1].tolist()
                counts = rm_df[item][rm_df[item] >= 1]
                interactions.extend(
                    zip(users, itertools.repeat(item, len(users)), counts))
            (self._interactions,
             self._weights) = dataset.build_interactions(interactions)
        else:
            int_df = pd.read_csv(self.path)
            if 'Unnamed: 0' in int_df.columns:
                int_df.drop('Unnamed: 0', axis=1, inplace=True)
            int_df = int_df.groupby(['subscriber_id', 'ddi_block_id']).size()\
                .reset_index().rename(columns={0: 'count'})
            dataset = Dataset()
            ids = int_df['subscriber_id'].unique()
            items = int_df['ddi_block_id'].unique()
            dataset.fit(list(ids), list(items))
            self.mapping = dataset.mapping()
            if self._use_weights:
                interactions = zip(int_df['subscriber_id'],
                                   int_df['ddi_block_id'],
                                   int_df['count'])
            else:
                interactions = zip(int_df['subscriber_id'],
                                   int_df['ddi_block_id'])
            (self._interactions,
             self._weights) = dataset.build_interactions(interactions)
    # Return the cached matrix whether it was just built or supplied earlier.
    # (The original returned it only from the else branch, so a freshly built
    # matrix was silently dropped.)
    return self._interactions
def train_model(df,
                user_id_col='user_id',
                item_id_col='business_id',
                item_name_col='name_business',
                evaluate=True):
    """Train the model using collaborative filtering.

    Args:
        df: the input dataframe.
        user_id_col: user id column.
        item_id_col: item id column.
        item_name_col: item name column.
        evaluate: whether to evaluate model performance.

    Returns:
        model_full: the trained model.
        df_interactions: dataframe with user-item interactions.
        user_dict: user dictionary mapping user_id to interaction index.
        item_dict: item dictionary mapping item_id to item_name.
    """
    if evaluate:
        print('Evaluating model...')
        # Pass the column parameters through instead of hard-coding them.
        evaluate_model(df, user_id_col=user_id_col, item_id_col=item_id_col)

    print('Training model...')
    # Build recommendations for known users and known businesses
    # with a collaborative filtering method.
    ds_full = Dataset()
    # Call fit to supply the user ids and item ids.
    ds_full.fit(
        df[user_id_col].unique(),  # all the users
        df[item_id_col].unique(),  # all the items
    )
    (interactions, weights) = ds_full.build_interactions(
        [(x[0], x[1], x[2]) for x in df.values])

    # model
    model_full = LightFM(no_components=100,
                         learning_rate=0.05,
                         loss='warp',
                         max_sampled=50)
    model_full.fit(interactions,
                   sample_weight=weights,
                   epochs=10,
                   num_threads=10)

    # mapping
    user_id_map, _, business_id_map, _ = ds_full.mapping()

    # data preparation
    df_interactions = pd.DataFrame(weights.todense())
    df_interactions.index = list(user_id_map.keys())
    df_interactions.columns = list(business_id_map.keys())
    user_dict = user_id_map
    item_dict = df.set_index(item_id_col)[item_name_col].to_dict()
    return model_full, df_interactions, user_dict, item_dict
def __init__(self, dataset: Dataset) -> None:
    """
    userid: user_id      row: internal user id
    itemid: recipe_id    column: internal recipe id
    """
    userid2row, _, itemid2col, _ = dataset.mapping()
    self.userid2row = userid2row
    self.itemid2col = itemid2col
    # Invert the dictionaries to get the mapping in the other direction.
    self.row2userid = {value: key for key, value in self.userid2row.items()}
    self.col2itemid = {v: k for k, v in self.itemid2col.items()}
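# Usage sketch for the mapping wrapper above (hypothetical: the class name
# `Mapper` and the ids are assumptions, and `dataset` must already be fit):
#
#   mapper = Mapper(dataset)
#   row = mapper.userid2row[some_user_id]          # external id -> internal row
#   assert mapper.row2userid[row] == some_user_id  # ...and back again
#   col = mapper.itemid2col[some_recipe_id]        # external id -> internal column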
movies['year'] = years
print(movies.head())
print(f'# Ratings: {len(ratings)}')
print(f'# Users: {len(set(ratings["userId"]))}')

# Register one extra user id for the new user we are about to add.
last_user = sorted(list(set(ratings['userId'])))[-1]
new_user = last_user + 1
print('Added new user: %s' % new_user)

dataset = Dataset()
dataset.fit(chain(ratings['userId'], [new_user]),
            movies['movieId'],
            item_features=(GENRES + list(movies['year']) +
                           list(set(movies['movieId']))))
_, _, item_mapping, _ = dataset.mapping()
rev_item_mapping = {y: x for (x, y) in item_mapping.items()}

# Collect the movie ids whose titles contain one of the match strings.
matches = []
for rid, row in movies.iterrows():
    for m in match_lst:
        if m.lower() in row[1].lower():
            matches.append(row[0])

print(good_ratings.head())
rating_iter = zip(good_ratings['userId'], good_ratings['movieId'])
new_iter = ((new_user, x) for x in matches)
interactions, weights = dataset.build_interactions(
    chain(rating_iter, new_iter))
print(repr(interactions))
print('Num users: {}, num_items {}.'.format(num_users, num_items))

# Build user features from users' interests:
# user_features = dataset.build_user_features(
#     ((x['_id'], x['interests']) for x in full_users), normalize=False)
# Build item features from items' subcategories:
# item_features = dataset.build_item_features(
#     ((x['_id'], x['subCategory']) for x in locations_data), normalize=False)
# print(repr(item_features))
# with open('data.json', 'w') as outfile:
#     json.dump(dataset.mapping(), outfile)

model = LightFM(loss='warp', no_components=30)
model.fit(interactions[0])

train_auc = auc_score(model, interactions[0], num_threads=2).mean()
print('Hybrid training set AUC: %s' % train_auc)

# np.set_printoptions(threshold=np.inf)
with open('virtual_mapping.json', 'w') as outfile:
    json.dump(dataset.mapping(), outfile)

score = model.predict(182, np.arange(num_items))
pdb.set_trace()  # debugging breakpoint left in by the author
print(repr(score))
# np.set_printoptions(threshold=np.inf)
ranked_items = np.argsort(-score)
find_location_id(ranked_items)
def run_learning_curve(test_fraction, max_epoch):
    # create data_train
    data = Dataset(user_identity_features=True)

    # user features
    user_features, user_feature_names = get_user_features()

    # create map between user_id, post_id, user_features and internal indices
    data.fit((x['user_id'] for x in get_data()),
             (x['post_id'] for x in get_data()),
             user_features=user_features)

    # print shape
    num_users, num_items = data.interactions_shape()
    print('Num users: {}, num_items {}.'.format(num_users, num_items))

    # ---------------------------
    # Building the interactions matrix
    # ---------------------------
    # create the interaction matrix to optimize
    (interactions, weights) = data.build_interactions(
        (x['user_id'], x['post_id']) for x in get_data())
    print(repr(interactions))

    # retrieve mapping from dataset
    user_id_map, user_feature_map, item_id_map, item_feature_map = data.mapping()

    # split test and train
    interaction_train, interaction_test = cross_validation.random_train_test_split(
        interactions, test_fraction)

    # ---------------------------
    # train model
    # ---------------------------
    model_cs = LightFM(learning_rate=0.05, loss='warp')
    model_ws = LightFM(learning_rate=0.05, loss='warp',
                       no_components=len(user_feature_names))

    precision_cs = []
    precision_ws = []
    recall_cs = []
    recall_ws = []
    for epoch in range(int(max_epoch / 2)):
        model_cs.fit(interaction_train, epochs=int(epoch * 2))
        model_ws.fit(interaction_train, user_features=user_features,
                     epochs=int(epoch * 2))

        # calculate precision and recall for each epoch
        precision_at_k_cs = evaluation.precision_at_k(
            model_cs, interaction_test, interaction_train)
        precision_at_k_ws = evaluation.precision_at_k(
            model_ws, interaction_test, interaction_train,
            user_features=user_features)
        recall_at_k_cs = evaluation.recall_at_k(
            model_cs, interaction_test, interaction_train)
        recall_at_k_ws = evaluation.recall_at_k(
            model_ws, interaction_test, interaction_train,
            user_features=user_features)

        # append to results
        precision_cs.append(sum(precision_at_k_cs) / len(precision_at_k_cs))
        precision_ws.append(sum(precision_at_k_ws) / len(precision_at_k_ws))
        recall_cs.append(sum(recall_at_k_cs) / len(recall_at_k_cs))
        recall_ws.append(sum(recall_at_k_ws) / len(recall_at_k_ws))

    df_result = pd.DataFrame({
        "precision_cs": precision_cs,
        "precision_ws": precision_ws,
        "recall_cs": recall_cs,
        "recall_ws": recall_ws,
    })

    # save to file
    df_result.to_csv("data/validation/df.epoch.csv", index=False)
    return
def run_validation(test_fraction, max_val):
    # containers to hold results
    ave_precision_at_k_cs = []
    ave_recall_at_k_cs = []
    ave_auc_score_cs = []
    ave_precision_at_k_ws = []
    ave_recall_at_k_ws = []
    ave_auc_score_ws = []

    # perform validation
    validation_itr = 0
    while validation_itr < max_val:
        print("Start validating cold, warm start, iteration %s" % validation_itr)
        # prevent a random failure from aborting the entire job
        try:
            # count
            validation_itr += 1

            # create data_train
            data_cs = Dataset()
            data_ws = Dataset(user_identity_features=True)

            # user features
            user_features, user_feature_names = get_user_features()
            print(user_feature_names)

            # create map between user_id, post_id, user_features and internal indices
            data_cs.fit((x['user_id'] for x in get_data()),
                        (x['post_id'] for x in get_data()))
            data_ws.fit((x['user_id'] for x in get_data()),
                        (x['post_id'] for x in get_data()),
                        user_features=user_features)

            # print shape
            num_users, num_items = data_ws.interactions_shape()
            print('Num users: {}, num_items {}.'.format(num_users, num_items))

            # ---------------------------
            # Building the interactions matrix
            # ---------------------------
            # create the interaction matrices to optimize
            (interactions_cs, weights_cs) = data_cs.build_interactions(
                (x['user_id'], x['post_id']) for x in get_data())
            (interactions_ws, weights_ws) = data_ws.build_interactions(
                (x['user_id'], x['post_id']) for x in get_data())
            print(repr(interactions_ws))

            # retrieve mappings from the datasets
            user_id_map_cs, user_feature_map_cs, item_id_map_cs, item_feature_map_cs = data_cs.mapping()
            user_id_map_ws, user_feature_map_ws, item_id_map_ws, item_feature_map_ws = data_ws.mapping()

            # split test and train
            interaction_train_cs, interaction_test_cs = cross_validation.random_train_test_split(
                interactions_cs, test_fraction)
            interaction_train_ws, interaction_test_ws = cross_validation.random_train_test_split(
                interactions_ws, test_fraction)

            # ---------------------------
            # train model
            # ---------------------------
            model_cs = LightFM(learning_rate=0.05, loss='warp')
            model_ws = LightFM(learning_rate=0.05, loss='warp',
                               no_components=len(user_feature_names))
            model_cs.fit(interaction_train_cs, epochs=30)
            model_ws.fit(interaction_train_ws, user_features=user_features, epochs=30)

            # ---------------------------
            # make predictions
            # ---------------------------
            precision_at_k_cs = evaluation.precision_at_k(
                model_cs, interaction_test_cs, interaction_train_cs)
            recall_at_k_cs = evaluation.recall_at_k(
                model_cs, interaction_test_cs, interaction_train_cs)
            auc_score_cs = evaluation.auc_score(
                model_cs, interaction_test_cs, interaction_train_cs)
            precision_at_k_ws = evaluation.precision_at_k(
                model_ws, interaction_test_ws, interaction_train_ws,
                user_features=user_features)
            recall_at_k_ws = evaluation.recall_at_k(
                model_ws, interaction_test_ws, interaction_train_ws,
                user_features=user_features)
            auc_score_ws = evaluation.auc_score(
                model_ws, interaction_test_ws, interaction_train_ws,
                user_features=user_features)

            # append the scores from each iteration to the results
            ave_precision_at_k_cs.append(sum(precision_at_k_cs) / len(precision_at_k_cs))
            ave_recall_at_k_cs.append(sum(recall_at_k_cs) / len(recall_at_k_cs))
            ave_auc_score_cs.append(sum(auc_score_cs) / len(auc_score_cs))
            ave_precision_at_k_ws.append(sum(precision_at_k_ws) / len(precision_at_k_ws))
            ave_recall_at_k_ws.append(sum(recall_at_k_ws) / len(recall_at_k_ws))
            ave_auc_score_ws.append(sum(auc_score_ws) / len(auc_score_ws))
        except Exception:
            print("Iteration %s failed. Skipping..." % validation_itr)

    print("Validation score for test")
    print(ave_precision_at_k_cs)
    print(ave_recall_at_k_cs)
    print(ave_auc_score_cs)
    print(ave_precision_at_k_ws)
    print(ave_recall_at_k_ws)
    print(ave_auc_score_ws)

    df_result = pd.DataFrame({
        'precision_at_k_cs': ave_precision_at_k_cs,
        'recall_at_k_cs': ave_recall_at_k_cs,
        'auc_score_cs': ave_auc_score_cs,
        'precision_at_k_ws': ave_precision_at_k_ws,
        'recall_at_k_ws': ave_recall_at_k_ws,
        'auc_score_ws': ave_auc_score_ws,
    })

    # save to file
    df_result.to_csv("data/validation/df.csv", index=False)
    return
                learning_rate=learning_rate,
                item_alpha=item_alpha,
                user_alpha=user_alpha)
model.fit(interaction_matrix,
          sample_weight=interaction_weight,
          epochs=epochs,
          num_threads=4,
          verbose=True)
print('[ %04ds ] Model fitted' % (time.time() - start_time))

recommendations = []
n_businesses = len(training_business_ids)
# n_users = len(training_user_ids)
best_k = 50

user_id_map, _, business_id_map, __ = dataset.mapping()
business_ids_list = list(training_business_ids)
training_business_indices = np.array(
    list(map(lambda id: business_id_map[id], business_ids_list)))
user_seen_businesses = Review.extract_user_seen_business(training_set)
print('[ %04ds ] Ready to produce recommendations' % (time.time() - start_time))

finished = 0
with open('user_list.json', 'r') as f:
    recommendation_user_list = json.load(f)['users']
n_users = len(recommendation_user_list)
for user_id in recommendation_user_list:
    # user_recommendations = {'user_id': user_id, 'recommended_businesses': []}
def main(train_file, val_file, test_file, weight, output_file):
    # Read data from parquet
    print('Reading data ...')
    train_df = pd.read_parquet(train_file)
    val_df = pd.read_parquet(val_file)
    test_df = pd.read_parquet(test_file)
    train_df = train_df[['user_id', 'book_id', 'rating']]
    val_df = val_df[['user_id', 'book_id', 'rating']]
    test_df = test_df[['user_id', 'book_id', 'rating']]

    # Build the ID mappings
    print('Building the ID mappings ...')
    train = Dataset()
    train.fit((x for x in train_df.user_id), (x for x in train_df.book_id))
    user_map = train.mapping()[0]
    item_map = train.mapping()[2]
    train_size = train.interactions_shape()
    with open(output_file, "a") as f:
        f.write(
            'There are {} interactions in the training data, including {} users and {} items \n'
            .format(len(train_df), train_size[0], train_size[1]))
    print(
        'There are {} interactions in the training data, including {} users and {} items'
        .format(len(train_df), train_size[0], train_size[1]))

    # Build the interactions matrix
    print('Building the interactions and weights matrix ...')
    if weight == 'True':
        # use rating + 1 as weights
        train_df.rating = train_df.rating + 1
        (train_int, train_weight) = train.build_interactions(
            ((i[1][0], i[1][1], i[1][2]) for i in train_df.iterrows()))
    else:
        (train_int, train_weight) = train.build_interactions(
            ((i[1][0], i[1][1]) for i in train_df.iterrows()))

    # Keep only interactions with rating >= 3 as true labels
    val_df = val_df[val_df.rating >= 3].reset_index(drop=True)
    val_user = np.array([user_map[i] for i in val_df.user_id])
    val_item = np.array([item_map[i] for i in val_df.book_id])
    val_data = val_df.rating
    val_int = coo_matrix((val_data, (val_user, val_item)), shape=train_size)

    test_df = test_df[test_df.rating >= 3].reset_index(drop=True)
    test_user = np.array([user_map[i] for i in test_df.user_id])
    test_item = np.array([item_map[i] for i in test_df.book_id])
    test_data = test_df.rating
    test_int = coo_matrix((test_data, (test_user, test_item)), shape=train_size)

    print('Running grid search on ranks and regularizations ...')
    ranks = [10, 20, 30]
    regs = [0, 1e-5, 5e-5]
    max_precision = -1
    best_rank = None
    best_reg = None
    best_training_time = None
    best_eval_time = None
    best_model = None

    # Do a grid search on ranks and regularizations using training and validation data
    for rank in ranks:
        for reg in regs:
            start_time = time.time()
            model = LightFM(no_components=rank,
                            item_alpha=reg,
                            user_alpha=reg,
                            loss='warp',
                            random_state=1211)
            # OPTIMIZE: precision@k
            model.fit(train_int, sample_weight=train_weight, epochs=10)
            train_end_time = time.time()
            val_precision = precision_at_k(model,
                                           val_int,
                                           train_interactions=train_int,
                                           k=500).mean()
            eval_end_time = time.time()
            with open(output_file, "a") as f:
                f.write('Rank %2d & Reg %.5f Validation Precision@500: %.5f \n'
                        % (rank, reg, val_precision))
            print('Rank %2d & Reg %.5f Validation Precision@500: %.5f' %
                  (rank, reg, val_precision))
            if val_precision > max_precision:
                max_precision = val_precision
                best_rank = rank
                best_reg = reg
                best_training_time = train_end_time - start_time
                best_eval_time = eval_end_time - train_end_time
                best_model = model

    # Evaluate the best model's performance on the test set
    test_precision = precision_at_k(best_model,
                                    test_int,
                                    train_interactions=train_int,
                                    k=500).mean()
    with open(output_file, "a") as f:
        f.write(
            'The best model with rank %2d and reg %.5f achieves test precision@500 of %.5f \n'
            % (best_rank, best_reg, test_precision))
        f.write('The training takes %ss and evaluation takes %ss \n' %
                (best_training_time, best_eval_time))
    print('The best model with rank %2d and reg %.5f achieves test precision@500 of %.5f'
          % (best_rank, best_reg, test_precision))
    print('The training takes %ss and evaluation takes %ss' %
          (best_training_time, best_eval_time))
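# Hypothetical invocation sketch for the script above (the file names are
# assumptions; note the original compares the `weight` flag as the string 'True'):
#
#   main('train.parquet', 'val.parquet', 'test.parquet',
#        weight='True', output_file='grid_search_results.txt')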
def init_movielens(path,
                   min_rating=0.0,
                   k=3,
                   item_features=None,
                   cluster_n=18,
                   model='vgg19',
                   test_percentage=0.2):
    valid_item_features = {'genres': 'genres', 'clusters': 'clusters'}
    if item_features is not None:
        assert all(item in valid_item_features.values() for item in item_features), \
            'Your specified item features are invalid. You have to use one or more of: ' \
            + ', '.join(valid_item_features)

    train_dataset = Dataset()
    test_dataset = Dataset()
    data = dict()
    min_interactions = dict()

    with open(path + '/ratings.csv', 'r') as ratings_file:
        reader = csv.reader(ratings_file, delimiter=',')
        next(reader)  # skip header
        ratings = []
        users = set()
        items = set()
        for row in reader:
            user_id = int(row[0])
            item_id = int(row[1])
            users.add(user_id)
            items.add(item_id)
            rating = float(row[2])
            if rating >= min_rating:
                ratings.append((user_id, item_id, rating))
                __add_interaction(min_interactions, user_id)

    __info_no_of_min_interactions(
        k, 'No of interactions per user overall ==> ', min_interactions)

    users = list(users)
    items = list(items)
    users_column, items_column, ratings_column = zip(*ratings)
    ratings = sparse.coo_matrix((ratings_column, (users_column, items_column)))
    ratings_train, ratings_test = random_train_test_split(
        ratings,
        test_percentage=test_percentage,
        random_state=np.random.RandomState(7))

    ratings_train_to_count = zip(ratings_train.row, ratings_train.col,
                                 ratings_train.data)
    ratings_train = zip(ratings_train.row, ratings_train.col,
                        ratings_train.data)
    ratings_test_to_count = zip(ratings_test.row, ratings_test.col,
                                ratings_test.data)
    ratings_test = zip(ratings_test.row, ratings_test.col, ratings_test.data)

    min_interactions = __count_train_test_min_interactions(ratings_train_to_count)
    __info_no_of_min_interactions(
        k, 'No of interactions per user on train ==> ', min_interactions)
    min_interactions = __count_train_test_min_interactions(ratings_test_to_count)
    __info_no_of_min_interactions(
        k, 'No of interactions per user on test ==> ', min_interactions)

    train_dataset.fit(users=users, items=items)
    test_dataset.fit(users=users, items=items)
    (train_interactions, train_weights) = train_dataset.build_interactions(ratings_train)
    (test_interactions, test_weights) = test_dataset.build_interactions(ratings_test)

    data.update({'train': train_interactions})
    data.update({'test': test_interactions})
    data.update({'train-mapping': train_dataset.mapping()})

    # add item features
    if item_features is not None:
        aggregated_features = []
        if valid_item_features.get('genres') in item_features:
            movie_genres, genres = __init_movies_genres(path)
            aggregated_features.append(movie_genres)
            train_dataset.fit_partial(item_features=genres)
            test_dataset.fit_partial(item_features=genres)
            train_dataset.fit_partial(items=list(movie_genres.keys()))
            test_dataset.fit_partial(items=list(movie_genres.keys()))
        if valid_item_features.get('clusters') in item_features:
            movies_posters_clusters, clusters = __init_movies_posters_clusters(
                path, cluster_n, model=model)
            aggregated_features.append(movies_posters_clusters)
            train_dataset.fit_partial(item_features=clusters)
            test_dataset.fit_partial(item_features=clusters)
            train_dataset.fit_partial(items=list(movies_posters_clusters.keys()))
            test_dataset.fit_partial(items=list(movies_posters_clusters.keys()))

        aggregated_features = __aggregate_features(aggregated_features)
        item_features = train_dataset.build_item_features(
            ((movie_id, aggregated_features.get(movie_id))
             for movie_id in aggregated_features.keys()))
        _ = test_dataset.build_item_features(
            ((movie_id, aggregated_features.get(movie_id))
             for movie_id in aggregated_features.keys()))
        data.update({'item_features': item_features})
    else:
        data.update({'item_features': None})

    return data
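# Minimal usage sketch for init_movielens (the MovieLens path, threshold and
# feature choice are assumptions, not taken from the original):
#
#   data = init_movielens('data/ml-latest-small', min_rating=3.0, k=3,
#                         item_features=['genres'])
#   train_interactions, test_interactions = data['train'], data['test']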
             'writer_name']].drop_duplicates().reset_index(drop=True)
users = uid[['uid', 'popular_section', 'popular_platform',
             'popular_sources']].drop_duplicates()

dataset = Dataset()
features_list = create_feature_list(items, cols=['section_primary', 'writer_name'])
user_features_list = create_feature_list(
    users, cols=['popular_section', 'popular_platform', 'popular_sources'])
# features_list = list(set(items.writer_name.to_list()))
dataset.fit(users=uid.uid.unique(),
            items=uid.article_id.unique(),
            item_features=features_list,
            user_features=user_features_list)

(interactions, weights) = dataset.build_interactions(
    (x.uid, x.article_id) for x in uid.itertuples())
n_users, n_items = interactions.shape

# Sparsity of the interaction matrix (the original computed this expression
# without assigning it anywhere).
sparsity = 1 - (interactions.getnnz() /
                (interactions.shape[0] * interactions.shape[1]))

item_features = dataset.build_item_features([
    (i.article_id, [i.section_primary, i.writer_name])
    for i in items.itertuples()
])
user_features = dataset.build_user_features([(u.uid, [u.popular_section])
                                             for u in users.itertuples()])

# These rebuild the feature matrices and overwrite the ones built just above.
item_features = dataset.build_item_features(build_features(items))
user_features = dataset.build_user_features(build_features(users))

user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping()
def predict_hard_users(
    train: pd.DataFrame,
    test: pd.DataFrame,
    genre: pd.DataFrame,
    education: pd.DataFrame,
    notices: pd.DataFrame,
    available_notices: set,
    applicant_notice: dict,
    header=None,
):
    user_feature = genre.merge(education, on="idpostulante", how="left")
    user_feature.drop(columns=["fechanacimiento"], inplace=True)
    user_feature_hard_user = user_feature[user_feature.idpostulante.isin(
        train.idpostulante)]
    uf = generate_features(user_feature[["sexo", "nombre", "estado"]])
    itf = generate_features(notices[[
        "nombre_zona", "tipo_de_trabajo", "nivel_laboral", "nombre_area"
    ]])

    dataset1 = Dataset()
    dataset1.fit(
        train.idpostulante.unique(),  # all the users
        notices.idaviso.unique(),     # all the items
        user_features=uf,   # additional user features
        item_features=itf,  # additional item features
    )

    # plugging in the interactions and their weights
    (interactions, weights) = dataset1.build_interactions([
        (x[1], x[0], x[3]) for x in train.values
    ])

    user_feature_list = generate_in_use_features(
        user_feature_hard_user[["sexo", "nombre", "estado"]].values,
        ["sexo", "nombre", "estado"],
    )
    user_tuple = list(zip(user_feature_hard_user.idpostulante, user_feature_list))
    user_features = dataset1.build_user_features(user_tuple, normalize=False)

    (
        user_id_map,
        user_feature_map,
        item_id_map,
        item_feature_map,
    ) = dataset1.mapping()
    inv_item_id_map = {v: k for k, v in item_id_map.items()}

    # for component in [10, 35, 50, 80, 100, 200]:
    component = 35
    model = lfm.LightFM(no_components=component, loss="warp", random_state=42)
    model.fit(
        interactions,
        # user_features=user_features,
        # sample_weight=weights,
        epochs=150,
        num_threads=8,
        verbose=True,
    )
    test_precision = precision_at_k(
        model,
        interactions,
        # user_features=user_features,
        k=10,
        num_threads=8,
    ).mean()
    logger.info(
        f"Evaluation for LightFM is: {test_precision} with {component} number of components"
    )

    final_predictions = {}
    for a_user in tqdm(test.idpostulante.unique()):
        try:
            notices_by_user = applicant_notice[a_user]
        except KeyError:
            notices_by_user = set()
        try:
            user_x = user_id_map[a_user]
        except KeyError:
            # unseen user: fall back to the first internal id
            user_x = 0
        n_users, n_items = interactions.shape
        prediction = np.argsort(
            model.predict(
                user_x,
                np.arange(n_items),
                # user_features=user_features,
            ))[::-1]
        prediction_for_user = []
        for pred in prediction:
            notice = inv_item_id_map[pred]
            should_add = (notice in available_notices
                          and notice not in notices_by_user)
            if should_add:
                prediction_for_user += [notice]
            if len(prediction_for_user) == 10:
                break
        final_predictions[a_user] = prediction_for_user

    write_dict(final_predictions, "lightfm", header)
    return ["lightfm"]
def train_model():
    # user features
    user_features, user_feature_names = get_user_features()

    # create data
    data_ws = Dataset(user_identity_features=True)  # warm start

    # create map between user_id, post_id, user_features and internal indices
    data_ws.fit((x['user_id'] for x in get_data()),
                (x['post_id'] for x in get_data()),
                user_features=user_features)
    # user_biases =

    # ---------------------------
    # Building the interactions matrix
    # ---------------------------
    # create the interaction matrix to optimize
    (interactions_ws, weights_ws) = data_ws.build_interactions(
        (x['user_id'], x['post_id']) for x in get_data())
    print(repr(interactions_ws))

    # retrieve mapping from dataset
    user_id_map, user_feature_map, item_id_map, item_feature_map = data_ws.mapping()

    # ---------------------------
    # train model
    # ---------------------------
    # initialize model
    model_warp_ws = LightFM(learning_rate=0.05,
                            loss='warp',
                            no_components=len(user_feature_names))
    # train model
    model_warp_ws.fit(interactions_ws, user_features=user_features, epochs=30)

    # ---------------------------
    # make predictions
    # ---------------------------
    # make predictions for all users
    prediction_ws = model_warp_ws.predict_rank(interactions_ws,
                                               user_features=user_features)

    # create an identity matrix representing hypothetical users that each have
    # exactly one user feature active
    user_features_identity = sparse.csr_matrix(
        np.identity(len(user_feature_names)))

    # make a prediction for each hypothetical user
    prediction_hypo = []
    for user_itr in range(len(user_feature_names)):
        # Score the hypothetical user whose only active feature is user_itr.
        # (The original always passed user_ids=0 and dict values as item_ids,
        # which made every iteration identical and would not run.)
        prediction_score = model_warp_ws.predict(
            user_ids=user_itr,
            item_ids=np.arange(len(item_id_map)),
            user_features=user_features_identity)
        # combine prediction scores with the item map
        prediction_zipped = zip(prediction_score, item_id_map)
        # sort by prediction score
        prediction_sorted = sorted(prediction_zipped,
                                   key=lambda x: x[0],
                                   reverse=True)
        # add to the list of hypothetical users
        prediction_hypo.append(prediction_sorted)

    return prediction_hypo, prediction_ws, user_id_map, item_id_map, user_feature_names
def lightfm_node(X1_train, X2_train, X1_test, X2_test):
    X2 = pd.concat([X2_train, X2_test])
    X1 = pd.concat([X1_train, X1_test]).set_index('id')
    X1.columns = ['X1_' + i for i in X1.columns]

    # Bin the continuous columns into quantile buckets and tag each value
    # with its column name so it can be used as a categorical user feature.
    X1['X1_5'] = pd.qcut(X1['X1_5'], np.arange(0, 1, 0.1), duplicates='drop')
    X1['X1_8'] = pd.qcut(X1['X1_8'], np.arange(0, 1, 0.1), duplicates='drop')
    X1['X1_6'] = pd.qcut(X1['X1_6'], np.arange(0, 1, 0.1), duplicates='drop')
    for col in ['X1_6', 'X1_8', 'X1_5', 'X1_1', 'X1_13']:
        X1[col] = X1[col].map(lambda x: '{' + col + '}_{' + str(x) + '}')
    X1 = X1.reset_index()

    from lightfm.data import Dataset
    dataset = Dataset()
    dataset.fit(users=(x for x in X2['id']), items=(x for x in X2['A']))
    dataset.fit_partial(users=(x for x in X1['id']),
                        user_features=(x for x in X1['X1_1']))
    dataset.fit_partial(users=(x for x in X1['id']),
                        user_features=(x for x in X1['X1_13']))
    dataset.fit_partial(users=(x for x in X1['id']),
                        user_features=(x for x in X1['X1_5']))
    dataset.fit_partial(users=(x for x in X1['id']),
                        user_features=(x for x in X1['X1_8']))
    dataset.fit_partial(users=(x for x in X1['id']),
                        user_features=(x for x in X1['X1_6']))

    user_features = dataset.build_user_features(
        [(x[1]['id'], x[1][['X1_1', 'X1_13', 'X1_5', 'X1_8',
                            'X1_6']].values.tolist()) for x in X1.iterrows()],
        normalize=True)
    (interactions, weights) = dataset.build_interactions(
        zip(*X2[['id', 'A']].values.T))

    model = LightFM(no_components=32,
                    learning_rate=0.04,
                    loss='bpr',
                    max_sampled=55,
                    random_state=0)
    num_epochs = 20
    for i in range(num_epochs):
        model.fit_partial(interactions, user_features=user_features)

    users_mapping, user_features_mapping, assets_mapping, asset_features_mapping = dataset.mapping()
    user_features_mapping_inv = {
        j: i for i, j in user_features_mapping.items()
    }

    # L2-normalise the embeddings; the first len(users_mapping) rows are the
    # per-user identity features.
    tag_embeddings = (model.user_embeddings.T /
                      np.linalg.norm(model.user_embeddings, axis=1)).T
    lightfm_embed = pd.DataFrame(tag_embeddings[:len(users_mapping)],
                                 index=X1['id'])
    return lightfm_embed
print('Building LightFM Dataset...')
print(50 * '-')
lfm_dataset = Dataset(user_identity_features=False,
                      item_identity_features=False)
lfm_dataset.fit(
    users=u_list,
    items=i_list,
    user_features=np.concatenate((users.edad.drop_duplicates().values,
                                  users.sexo.drop_duplicates().values,
                                  users.educacion.drop_duplicates().values),
                                 axis=0))

print('Retrieving internal mappings and dictionaries...')
u_map, u_feat_map, i_map, i_feat_map = lfm_dataset.mapping()

print(50 * '-')
print('Building Interactions...')
print(50 * '-')
interactions = train.groupby(['idpostulante', 'idaviso']).agg('count').rename(
    columns={'fechapostulacion': 'rating'}).reset_index()
# print(interactions.sort_values('rating', ascending=False).head())
interactions = np.array([
    interactions.idpostulante.values,
    interactions.idaviso.values,
    interactions.rating.values
], dtype=object).T  # np.object is removed in modern NumPy; use the builtin
# creating the interaction matrix for the model
(interactions, weights) = dataset.build_interactions(
    ((x['ProfielId'], x['VacatureId']) for x in qd.getMatchings()))
# print(interactions.toarray())

# creating the item feature matrix for the model
'''item_features = dataset.build_item_features(
    ((x['VacatureId'], [x['Naam'], x['Taal'], x['Functie']])
     for x in qd.getVacancies()), normalize=False)
'''
item_features = dataset.build_item_features(
    ((x['VacatureId'], [x['Naam']]) for x in qd.getVacancies()),
    normalize=False)
# print(item_features.toarray())
print(dataset.mapping())

# creating a user feature matrix (left disabled)
'''
user_features = dataset.build_user_features(((x['Id'], [x['Motivatie']])
                                             for x in qd.getProfiles()))
print(user_features)
'''

# Split the set in train and test.
# random_train_test_split returns (train, test) in that order; the original
# unpacked them the other way around.
train, test = random_train_test_split(interactions,
                                      test_percentage=0.2,
                                      random_state=None)

# Start training the model
print("--- Start model training ---")
model = LightFM(no_components=1, learning_rate=0.027, loss='warp')
def train_model(df,
                user_id_col='user_id',
                item_id_col='business_id',
                item_name_col='name_business',
                evaluate=True):
    """Train the model using collaborative filtering.

    Args:
        df: the input dataframe.
        user_id_col: user id column.
        item_id_col: item id column.
        item_name_col: item name column.
        evaluate: whether to evaluate model performance.

    Returns:
        model_full: the trained model.
        df_interactions: dataframe with user-item interactions.
        user_dict: user dictionary mapping user_id to interaction index.
        item_dict: item dictionary mapping item_id to item_name.
        user_feature_map: the feature map of users.
        business_feature_map: the feature map of items.
    """
    if evaluate:
        print('Evaluating model...')
        # Pass the column parameters through instead of hard-coding them.
        evaluate_model(df, user_id_col=user_id_col, item_id_col=item_id_col)

    print('Training model...')
    # Build recommendations for known users and known businesses
    # with a collaborative filtering method.
    ds_full = Dataset()

    # Call fit to supply the user ids, item ids and user/item features.
    user_cols = ['user_id', 'average_stars']
    categories = [c for c in df.columns if c[0].isupper()]
    item_cols = ['business_id', 'state']
    for i in df.columns[10:]:
        item_cols.append(str(i))
    user_features = user_cols[1:]
    item_features = item_cols[2:]
    ds_full.fit(
        df[user_id_col].unique(),     # all the users
        df[item_id_col].unique(),     # all the items
        user_features=user_features,  # additional user features
        item_features=item_features)

    df_users = df.drop_duplicates(user_id_col)
    # df_users = df[df.duplicated(user_id_col) == False]
    users_features = []
    for i in range(len(df_users)):
        users_features.append(get_users_features_tuple(df_users.values[i]))
    users_features = ds_full.build_user_features(users_features, normalize=False)

    items = df.drop_duplicates(item_id_col)
    # items = df[df.duplicated(item_id_col) == False]
    items_features = []
    for i in range(len(items)):
        items_features.append(get_items_features_tuple(items.values[i], categories))
    items_features = ds_full.build_item_features(items_features, normalize=False)

    (interactions, weights) = ds_full.build_interactions(
        [(x[0], x[1], x[2]) for x in df.values])

    # model
    model_full = LightFM(no_components=100,
                         learning_rate=0.05,
                         loss='warp',
                         max_sampled=50)
    model_full.fit(interactions,
                   user_features=users_features,
                   item_features=items_features,
                   sample_weight=weights,
                   epochs=10,
                   num_threads=10)

    # mapping
    user_id_map, user_feature_map, business_id_map, business_feature_map = \
        ds_full.mapping()

    # data preparation
    df_interactions = pd.DataFrame(weights.todense())
    df_interactions.index = list(user_id_map.keys())
    df_interactions.columns = list(business_id_map.keys())
    user_dict = user_id_map
    item_dict = df.set_index(item_id_col)[item_name_col].to_dict()

    return model_full, df_interactions, user_dict, \
        item_dict, user_feature_map, business_feature_map
    for u in users:
        # temp_df = pd.DataFrame(user_map[u] * k)
        scores = model.predict(u, litems)
        # sdict[user_map[u]] = [items_map[i] for i in np.argsort(-scores)[:k]]
        temp_df = pd.DataFrame({
            'user_id': [user_map[u]] * k,
            'recom': [items_map[i] for i in np.argsort(-scores)[:k]]
        })
        all_df = pd.concat([all_df, temp_df], ignore_index=True)
    return all_df


# recom = predict(model, range(num_users), dataset.mapping()[0], dataset.mapping()[2])
manager = mp.Manager()
sdict = manager.dict()
predict_mp(model, num_users, dataset.mapping()[0], dataset.mapping()[2])
recom_df = pd.DataFrame(dict(sdict).items(), columns=['user_id', 'Recom'])
recom_df = recom_df.explode('Recom').reset_index(drop=True)
recom_df.to_csv('train_predictions.csv')

# Disabled block from the original (the opening triple quote was never closed
# in the excerpt; it is closed here after the dataset.fit call it covers):
"""
df_item_features = df[["city_id", "hotel_country"]].drop_duplicates()
features_list = create_feature_list(df_item_features, cols=["hotel_country"])
fdataset = Dataset()
# dataset.fit(df[USER_ID_COL].unique(), df[TARGET_COL].unique())
dataset.fit(df[USER_ID_COL], df[TARGET_COL], item_features=features_list)
"""
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))