def test_random_train_test_split(test_percentage):
    """Check that random_train_test_split honours the requested split size
    and that the train/test sets share no interactions.

    Parameters
    ----------
    test_percentage : float
        Fraction of interactions that should land in the test set.
    """
    data = fetch_movielens()["train"]
    train, test = random_train_test_split(data, test_percentage=test_percentage)
    # The split rounds the cutoff to a whole number of interactions, so the
    # realised test fraction can differ from test_percentage by strictly less
    # than one interaction's worth.  Exact float equality made this test flaky
    # for percentages that do not divide data.nnz evenly.
    assert abs(test.nnz / float(data.nnz) - test_percentage) < 1.0 / data.nnz
    _assert_disjoint(train, test)
def model(df, params, u=None, i=None):
    """Train a LightFM model on (userID, poiID) interactions and report
    ranking metrics on the train and test splits.

    Parameters
    ----------
    df : pandas.DataFrame
        Interactions with 'userID' and 'poiID' columns.
    params : dict
        Hyper-parameters: 'seed', 'test' (test fraction), 'f' (no_components),
        'lr', 'loss', 'alpha', 'epochs', 'k'.
    u : tuple or None
        Optional (per-user feature values keyed by user id, feature names).
    i : tuple or None
        Optional (per-item feature values keyed by item id, feature names).

    Returns
    -------
    dict
        Metrics as percentages: 'pr-train', 'mrr-train', 'pr-test', 'mrr-test'.
    """
    state = np.random.RandomState(params['seed'])
    data = Dataset()
    data.fit(df['userID'].unique(), df['poiID'].unique(),
             user_features=u[1] if u is not None else None,
             item_features=i[1] if i is not None else None)

    if u is not None:
        # .items() yields the same (id, feature) pairs the old code produced
        # with an identity lambda over .iteritems(); .iteritems() was removed
        # in pandas 2.0.
        user_features = data.build_user_features(u[0].items(), normalize=False)
    else:
        user_features = None

    if i is not None:
        # Each item's single feature value is wrapped in a list, as expected
        # by build_item_features.
        item_features_iterable = ((key, [val]) for key, val in i[0].items())
        item_features = data.build_item_features(item_features_iterable,
                                                 normalize=False)
    else:
        item_features = None

    ratings, weights = data.build_interactions(
        df[['userID', 'poiID']].itertuples(index=False, name=None))
    train, test = random_train_test_split(ratings,
                                          test_percentage=params['test'],
                                          random_state=state)

    lfm = LightFM(no_components=params['f'], learning_rate=params['lr'],
                  loss=params['loss'], user_alpha=params['alpha'],
                  random_state=state)
    lfm.fit(train, epochs=params['epochs'],
            user_features=user_features, item_features=item_features)

    # Mean per-user scores, reported as percentages.
    return {
        'pr-train': 100.0 * precision_at_k(lfm, train, k=params['k'], user_features=user_features, item_features=item_features).mean(),
        'mrr-train': 100.0 * reciprocal_rank(lfm, train, user_features=user_features, item_features=item_features).mean(),
        'pr-test': 100.0 * precision_at_k(lfm, test, k=params['k'], user_features=user_features, item_features=item_features).mean(),
        'mrr-test': 100.0 * reciprocal_rank(lfm, test, user_features=user_features, item_features=item_features).mean()
    }
csr_data1, user_lookup1, item_lookup1 = create_sparse_matrix( traindata, user_key, item_key) #csr_data2, user_lookup2, item_lookup2 = create_sparse_matrix(testdata,user_key,item_key) user_items_train = csr_data1.T.tocsr() #user_items_test = csr_data2.T.tocsr() print(user_items_train) print('\n') #print(user_items_test) #print('\n') print(user_items_train.shape) #print(user_items_test.shape) print("Splitting the data into train/test set...\n") train, test = cross_validation.random_train_test_split(user_items_train) # print(train,test) # print(train.shape(),test.shape()) model1 = LightFM(learning_rate=0.05, loss='bpr') model2 = LightFM(learning_rate=0.05, loss='warp') print("Fitting models of BPR & WARP ranking losses...\n") model1.fit(train, epochs=10) model2.fit(train, epochs=10) #ranks = model.predict(user_items_train,num_threads=1) #print(ranks) res = model1.predict_rank(test) print(res) print("Evaluating methods...\n")
# Build the user-feature matrix from (user_id, [age]) pairs.
user_features = dataset.build_user_features(
    ((x['User-ID'], [x['Age']]) for x in get_user_features()))
labels = np.array([x['ISBN'] for x in get_ratings()])

#################################
#                               #
#      Training the Model       #
#                               #
#################################

# WARP loss optimises the rank of positive items.
model = LightFM(loss='warp')
# Hold out 20% of the interactions for evaluation.
(train, test) = random_train_test_split(interactions=interactions,
                                        test_percentage=0.2)
model.fit(train, item_features=item_features, user_features=user_features,
          epochs=2)

### model performance evaluation
#train_precision = precision_at_k(model, train,item_features=item_features, k=10).mean()
#test_precision = precision_at_k(model, test, item_features=item_features,k=10).mean()
#train_auc = auc_score(model, train,item_features=item_features).mean()
#test_auc = auc_score(model, test,item_features=item_features).mean()
#print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
#print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))
#print("testing testing testing")
((x['VacatureId'], [x['Naam']]) for x in qd.getVacancies()), normalize=False) # print(item_features.toarray()) print(dataset.mapping()) ''' user_features = dataset.build_user_features(((x['Id'], [x['Motivatie']]) for x in qd.getProfiles())) print(user_features) ''' # Creating a user fettu # Split the set in train and test test, train = random_train_test_split(interactions, test_percentage=0.2, random_state=None) # Start training the model print("--- Start model training ---") model = LightFM(no_components=1, learning_rate=0.027, loss='warp') model.fit(train, item_features=item_features, epochs=100, num_threads=4, verbose=False) # model.fit(train,epochs=12,num_threads=4) modelnofeatures = LightFM(no_components=1, learning_rate=0.027, loss='warp') modelnofeatures.fit(train, epochs=100, num_threads=4, verbose=False)
# NOTE(review): the opening of this call (a DataFrame pivot) lies outside this
# chunk; the fragment below is kept as-is.
values='playCountScaled')
ratings = ratings_df.fillna(0).values
# Percentage of non-zero cells in the dense ratings matrix.
sparsity = float(len(
    ratings.nonzero()[0])) / (ratings.shape[0] * ratings.shape[1]) * 100
X = csr_matrix(ratings)
n_users, n_items = ratings_df.shape
user_ids = ratings_df.index.values
artist_names = ap.sort_values("artistID")["name"].unique()
Xcoo = X.tocoo()
data = Dataset()
# Users/items are identified by their positional indices in the matrix.
data.fit(np.arange(n_users), np.arange(n_items))
interactions, weights = data.build_interactions(
    zip(Xcoo.row, Xcoo.col, Xcoo.data))
train, test = random_train_test_split(interactions)
model = LightFM(learning_rate=0.05, loss='warp')
model.fit(train, epochs=10, num_threads=2)

# Generating the list of artists at start-up:
artIDs = ap['artistID'].unique()
numarts = len(ap['artistID'].unique())
listart = ""
# Build one HTML checkbox per artist name.
for it, artName in enumerate(ap['name'].unique()):
    listart = listart + '<input type="checkbox" name="' + str(
        artIDs[it]) + '" value="' + str(artName) + '">' + artName + '<br>'

# get_recommendation from Jupyter notebook:
# NOTE(review): the body of this function lies outside this chunk.  The
# default argument binds the module-level `ratings` array at definition time.
def get_recommendation(userid, ratings=ratings):
# NOTE(review): the enclosing `def` of this training helper lies outside this
# chunk; the indented lines below are its tail (hence `num_epochs`, the
# `train` parameter and the `return`).
    NUM_EPOCHS = num_epochs
    ITEM_ALPHA = 1e-6  # Recommended by LightFM

    # Let's fit a WARP model: these generally have the best performance.
    model = LightFM(loss='warp',
                    item_alpha=ITEM_ALPHA,
                    no_components=NUM_COMPONENTS)

    # Fit model
    model = model.fit(train, epochs=NUM_EPOCHS, num_threads=NUM_THREADS)
    return model


if __name__ == "__main__":
    hike_user_rating_matrix = gen_collabfilt_matrix(
        hike_data)  # generate interaction matrix
    df = convert_to_binary(hike_user_rating_matrix,
                           2.5)  # binarize interaction matrix

    # Fit model
    # NOTE(review): `interaction_matrix` is not defined in this chunk —
    # presumably this should be `df`; verify against the full file.
    dataset, interactions = lightfm_implicit_matrix(interaction_matrix)

    # Create training/test set (fixed seed for reproducibility).
    train, test = cross_validation.random_train_test_split(
        interactions,
        test_percentage=0.2,
        random_state=np.random.RandomState(seed=1))

    #Train model
    model = lightfm_train(train, 30, 30)
    print('Great job! You trained your model!')
def run_learning_curve(test_fraction, max_epoch):
    """Record learning curves for a cold-start (cs) model and a warm-start
    (ws) model that additionally uses user features.

    Parameters
    ----------
    test_fraction : float
        Fraction of interactions held out for evaluation.
    max_epoch : int
        Upper bound on training epochs; models are evaluated every 2 epochs.

    Mean precision@k / recall@k per epoch step are written to
    data/validation/df.epoch.csv.
    """
    # create data_train
    data = Dataset(user_identity_features=True)
    # user features
    user_features, user_feature_names = get_user_features()
    # create map between user_id, post_id, user_features and internal indices
    data.fit((x['user_id'] for x in get_data()),
             (x['post_id'] for x in get_data()),
             user_features=user_features)
    # print shape
    num_users, num_items = data.interactions_shape()
    print('Num users: {}, num_items {}.'.format(num_users, num_items))

    #---------------------------
    # Building the interactions matrix
    #---------------------------
    # create interaction matrix to optimize
    (interactions, weights) = data.build_interactions(
        ((x['user_id'], x['post_id'])) for x in get_data())
    print(repr(interactions))

    # retrieve mapping from dataset
    user_id_map, user_feature_map, item_id_map, item_feature_map = data.mapping()

    # split test and train
    interaction_train, interaction_test = cross_validation.random_train_test_split(interactions, test_fraction)

    #---------------------------
    # train model
    #---------------------------
    model_cs = LightFM(learning_rate=0.05, loss='warp')
    model_ws = LightFM(learning_rate=0.05, loss='warp',
                       no_components=len(user_feature_names))
    precision_cs = []
    precision_ws = []
    recall_cs = []
    recall_ws = []
    # NOTE(review): each iteration re-fits both models from scratch with
    # epochs = 0, 2, 4, ..., which is quadratic in max_epoch; the first
    # iteration evaluates an untrained (epochs=0) model.  fit_partial would
    # be cheaper but changes the training trajectory — left as-is.
    for epoch in range(int(max_epoch/2)):
        model_cs.fit(interaction_train, epochs=int(epoch*2))
        model_ws.fit(interaction_train, user_features=user_features,
                     epochs=int(epoch*2))
        # calculate precision and recall for each epoch
        precision_at_k_cs = evaluation.precision_at_k(model_cs, interaction_test, interaction_train)
        precision_at_k_ws = evaluation.precision_at_k(model_ws, interaction_test, interaction_train, user_features=user_features)
        recall_at_k_cs = evaluation.recall_at_k(model_cs, interaction_test, interaction_train)
        recall_at_k_ws = evaluation.recall_at_k(model_ws, interaction_test, interaction_train, user_features=user_features)
        # append to result (mean per-user score)
        precision_cs.append(sum(precision_at_k_cs) / len(precision_at_k_cs))
        precision_ws.append(sum(precision_at_k_ws) / len(precision_at_k_ws))
        recall_cs.append(sum(recall_at_k_cs) / len(recall_at_k_cs))
        recall_ws.append(sum(recall_at_k_ws) / len(recall_at_k_ws))

    df_result = pd.DataFrame({
        "precision_cs": precision_cs,
        "precision_ws": precision_ws,
        "recall_cs": recall_cs,
        "recall_ws": recall_ws,
    })
    # save to file
    df_result.to_csv("data/validation/df.epoch.csv", index=False)
    return
resume_text = pd.read_csv("~/data/Candidate Report_tokenized.csv").fillna('') ####### prepare item features and user features # item features resume_embeddings.set_index("ID", inplace=True) resume_features_sparse = sparse.csr_matrix(resume_embeddings.values) job_embeddings.set_index("ID", inplace=True) job_features_sparse = sparse.csr_matrix(job_embeddings.values) # read the interaction matrix # interaction_sparse = sparse.load_npz('data/interaction_v4.npz') interaction_sparse = sparse.load_npz('data/interaction_v5.npz') interaction_sparse.data = np.nan_to_num(interaction_sparse.data, copy=False) # train test split for cv train, test = random_train_test_split(interaction_sparse, test_percentage=0.3, random_state = None) # free memory del job_embeddings del resume_embeddings del interaction_sparse gc.collect() ##### create and train LightFM model ###### NUM_THREADS = 4 NUM_COMPONENTS = 30 NUM_EPOCHS = 50 ITEM_ALPHA = 1e-6 K_num = 5 model = LightFM(loss='warp'
def preprocess():
    """Load Yelp-style user/business/review data, build a LightFM Dataset
    with weighted user and item features, and return a train/test split.

    Returns
    -------
    tuple
        (train, test, data_business, dataset, user_features, item_features).
    """
    import pandas as pd
    import math
    import numpy as np
    data_users = pd.read_csv('users_tag.csv',index_col=0)
    data_business = pd.read_csv('business_Nora.csv',index_col=0)
    data_review = pd.read_csv('reviews_cleaned.csv',index_col = 0)
    # Log-transform heavily skewed count columns.
    data_users.review_count = pd.Series([math.log(x+1) for x in data_users.review_count])
    data_users.useful = pd.Series([math.log(x+1) for x in data_users.useful])
    #clean business skewness
    data_business.review_count = pd.Series([math.log(x+1) for x in data_business.review_count])
    from lightfm.data import Dataset
    #model establishment
    dataset = Dataset()
    dataset.fit(data_review.user_id,data_review.business_id)
    type(dataset)  # NOTE(review): result discarded — leftover debugging.
    num_users, num_items = dataset.interactions_shape()
    # fit item and user features.
    dataset.fit_partial(items=data_business.business_id, item_features=['stars'])
    dataset.fit_partial(items=data_business.business_id, item_features=['review_count'])
    # Columns from index 24 onward are tag (tf-idf) features.
    tar_cols = [x for x in data_business.columns[24:]]
    dataset.fit_partial(items = data_business.business_id, item_features = tar_cols)
    # Iterating a DataFrame yields its column names.
    user_cols = [x for x in data_users[['review_count', 'useful', 'Ice Cream & Frozen Yogurt', 'Korean', 'Tapas/Small Plates', 'Vietnamese', 'Vegan', 'Caribbean', 'Food Delivery Services', 'Lounges', 'Pubs', 'Greek', 'Cocktail Bars', 'Mexican', 'Wine Bars', 'Tea Rooms', 'Delis', 'Vegetarian', 'Ethnic Food', 'Salad', 'Seafood', 'Beer', 'American (New)', 'Juice Bars & Smoothies', 'Shopping', 'Barbeque', 'Sports Bars', 'French', 'Chicken Wings', 'Gastropubs', 'Diners', 'Gluten-Free', 'Thai', 'Comfort Food', 'Health Markets', 'Halal', 'Caterers', 'Arts & Entertainment']]]
    dataset.fit_partial(users=data_users.user_id, user_features = user_cols)
    print("Building Interactions")
    (interactions, weights) = dataset.build_interactions([(x['user_id'], x['business_id'],
                                                           x['stars']) for index,x in data_review.iterrows()])
    print("Interactions Build")

    # build user and item features
    # NOTE(review): build_dict and user_build_dict are identical — they could
    # be merged into one helper.
    def build_dict(df,tar_cols,val_list):
        # Map each tag column to its (tf-idf) value, rescaled so all feature
        # weights for the row sum to a fixed budget.
        rst = {}
        for col in tar_cols:
            rst[col] = df[col]
        sum_val = sum(list(rst.values())) # get sum of all the tfidf values
        if(sum_val == 0):
            return rst
        else:
            w = (2-sum(val_list))/sum_val # weight for each tag to be able to sum to 1
            for key,value in rst.items():
                rst[key] = value * w
            return rst

    def user_build_dict(df,tar_cols,val_list):
        # Same rescaling as build_dict, applied to user tag columns.
        rst = {}
        for col in tar_cols:
            rst[col] = df[col]
        sum_val = sum(list(rst.values())) # get sum of all the tfidf values
        if(sum_val == 0):
            return rst
        else:
            w = (2-sum(val_list))/sum_val # weight for each tag to be able to sum to 1
            for key,value in rst.items():
                rst[key] = value * w
            return rst

    # get max of each column to regularize value to [0,1]
    max_star = max(data_business.stars)
    max_b_rc = max(data_business.review_count)
    print('max_b_rc')
    print(max_b_rc)
    # give CF info weight 0.5, all other 0.5. Then in others, give (star, review count) 0.25 and tags 0.25
    item_features = dataset.build_item_features(((x['business_id'],
                                                  {'stars':0.5*x['stars']/max_star,
                                                   'review_count':0.5*x['review_count']/max_b_rc,
                                                   **build_dict(x,tar_cols,[0.5*x['stars']/max_star, 0.5*x['review_count']/max_b_rc])})
                                                 for index,x in data_business.iterrows()))
    # user_features = dataset.build_user_features(((x['user_id'],
    #                                               [x['is_elite'],x['year']])
    #                                              for index, x in data_users.iterrows()))
    max_u_rc = max(data_users.review_count)
    max_useful = max(data_users.useful)
    user_features = dataset.build_user_features(((x['user_id'],
                                                  {'review_count':0.35*x['review_count']/max_u_rc,
                                                   'useful':0.35*x['useful']/max_useful,
                                                   **user_build_dict(x,user_cols,[0.35*x['review_count']/max_u_rc,0.35*x['useful']/max_useful])})
                                                 for index, x in data_users.iterrows()))

    #train-test split
    # seed = 12345 #has multiple seeds set up to account for split biases
    # seed = 101
    # seed = 186
    seed = 123
    from lightfm.cross_validation import random_train_test_split
    train,test=random_train_test_split(interactions,test_percentage=0.2,random_state=np.random.RandomState(seed))
    print('The dataset has %s users and %s items, '
          'with %s interactions in the test and %s interactions in the training set.'
          % (train.shape[0], train.shape[1], test.getnnz(), train.getnnz()))
    # NOTE(review): the result of this expression is discarded — it should
    # probably be an assert to actually enforce disjointness.
    train.multiply(test).nnz == 0 # make sure train and test are truly disjoint
    return train,test,data_business,dataset,user_features,item_features
def auto_tune_parameter(k, interactions, model, data, param1,
                        param_type="components", user_features=None,
                        item_features=None):
    """ Function that identifies the optimal values of parameters which
    maximizes performance of model

    Parameters:
    - k: Number of folds used to tune parameters
    - interactions: matrix of interactions between users and artists
    - model: specified model used in recommender system
    - data: sparse user-item matrix
    - param1: list of values to try for hyperparameter
    - param_type: name of the parameter we want to optimize; options are:
        - "components"
        - "learning rate"
        - "loss function"
    - user_features: parameter used for evaluating and fitting the LightFM model.
    - item_features: parameter used for evaluating and fitting the LightFM model.

    Ouput:
    - max_recall_list: a list of k tuples, one for each fold. each tuple is in
      the form (max_recall,max_first_param,max_precision,max_coverage) which
      records the best recall, and the param that achieved it, and the
      max_precision and max_coverage achieved (which may be from different
      param values).
    - heatmap_list: a list of k heatmaps of the recall values for the tested
      parameter (one heatmap per fold). Useful for visualizations
    """
    # Train model
    # Create list of MAX Recall depending on # params
    max_recall_list = [
    ]  # will end up being length k list of tuples of best param values
    heatmap_list = []
    # Hold out a final test set; the remainder is re-split per "fold".
    train_and_tune, test = cross_validation.random_train_test_split(
        data, test_percentage=.2, random_state=None)
    train_list = []
    tune_list = []
    # NOTE(review): each "fold" is an independent random 80/20 re-split of
    # train_and_tune, not a partition into k disjoint folds.
    for i in range(k):
        trainvals, tunevals = cross_validation.random_train_test_split(
            train_and_tune, test_percentage=.2, random_state=None)
        train_list.append(trainvals)
        tune_list.append(tunevals)
    test_recall = 0
    test_first_param = param1[0]
    # create recall matrix storing for each combination of params
    for fold in range(
            k):  # For each fold; there are k-1 folds within train_and_tune
        recall_heatmap = [0 for y in range(len(param1))]
        train = train_list[fold]
        tune = tune_list[fold]
        # initialize best value of first_param for this fold
        max_first_param = param1[0]
        max_recall = 0
        max_precision = 0
        max_coverage = 0
        value1_index = 0  # index for heatmap
        print("Fitting fold number...", fold)
        for value1 in param1:
            print("Trying ", (value1))
            # Build the candidate model with the tested hyper-parameter.
            if param_type == "components":
                usemodel = model(learning_rate=0.05,
                                 no_components=value1,
                                 loss='warp')
            elif param_type == "learning_rate":
                usemodel = model(learning_rate=value1,
                                 no_components=50,
                                 loss='warp')
            elif param_type == "loss_function":
                usemodel = model(learning_rate=0.05,
                                 no_components=50,
                                 loss=value1)
            usemodel.fit(train,
                         user_features=user_features,
                         item_features=item_features,
                         epochs=25)
            coverage, precision, recall = evaluate_lightfm(
                usemodel,
                data,
                train,
                tune,
                item_features=item_features,
                user_features=user_features)
            print(value1_index)
            recall_heatmap[value1_index] = recall  # update heatmap
            # update maximum values (precision/coverage may peak at different
            # param values than recall)
            max_precision = max(max_precision, precision)
            max_coverage = max(max_coverage, coverage)
            if recall > max_recall:
                max_recall = recall
                max_first_param = value1
            value1_index = value1_index + 1
        max_recall_list.append(
            [max_recall, max_first_param, max_precision, max_coverage])
        if max_recall > test_recall:
            print("Fold ", fold, " beat the record for recall!")
            print("New best recall is ", max_recall)
            print("New best param is ", (max_first_param))
            test_recall = max_recall
            test_first_param = max_first_param
        heatmap_list.append(recall_heatmap)
        print("end of fold---------------------------")
    # Now, test_first_param should be optimized
    if param_type == "components":
        usemodel = model(learning_rate=0.05,
                         no_components=test_first_param,
                         loss='warp')
    elif param_type == "learning_rate":
        usemodel = model(learning_rate=test_first_param,
                         no_components=50,
                         loss='warp')
    elif param_type == "loss_function":
        usemodel = model(learning_rate=0.05,
                         no_components=50,
                         loss=test_first_param)
    # Re-fit on the full train+tune data and score on the held-out test set.
    usemodel.fit(train_and_tune,
                 user_features=user_features,
                 item_features=item_features,
                 epochs=25)
    final_coverage, final_precision, final_recall = evaluate_lightfm(
        usemodel,
        data,
        train_and_tune,
        test,
        user_features=user_features,
        item_features=item_features)
    print("The recall on the test set is ", final_recall,
          ", after hyperparameter optimization")
    print("The precision on the test set is ", final_precision,
          ", after hyperparameter optimization")
    print("The coverage on the test set is ", final_coverage,
          ", after hyperparameter optimization")
    return max_recall_list, heatmap_list
# NOTE(review): these first three statements appear to be the tail of a loop
# over `lines` whose header lies outside this chunk; layout here is a
# best-effort reconstruction.
temp_rows,temp_columns,rate=line.split("\t")
rows=max(rows,int(temp_rows))
columns=max(columns,int(temp_columns))

# Dense 0/1 interaction matrix sized by the largest observed indices.
arr_train=np.zeros([rows+1,columns+1])
for line in lines:
    line=line.strip()
    temp_rows,temp_columns,rate=line.split("\t")
    if rate=="1":
        #print(temp_rows,temp_columns)
        arr_train[int(temp_rows),int(temp_columns)]=1
df_data=pd.DataFrame(arr_train,index=list(range(rows+1)),columns=list(range(columns+1)))
data1 = csr_matrix(df_data)
data1.toarray()  # NOTE(review): result discarded — leftover debugging.
# Fixed seed so the split is reproducible.
train,test=random_train_test_split(data1,test_percentage=0.4,
                                   random_state=np.random.RandomState(1))
# Pre-trained 20-dim entity embeddings used as item features (disabled below).
arr_itemfeature=np.load("entity_embedding.npy")[:columns+1,:]
#arr_itemfeature*=10
df_itemfeature=pd.DataFrame(arr_itemfeature,index=list(range(columns+1)),columns=list(range(20)))
data_feature=csr_matrix(df_itemfeature)
data_feature.toarray()  # NOTE(review): result discarded — leftover debugging.
#model.fit(train,item_features=data_feature,epochs=50,verbose=True)
model.fit(train,epochs=10)
#print(auc_score(model,test,item_features=data_feature).mean())
print(auc_score(model,test).mean())
y_true=[]
y_predict=[]
max_rate=0
def train_val_split(csr_mat):
    """Split a sparse interaction matrix 80/20 into train and validation sets."""
    train_part, val_part = random_train_test_split(csr_mat, test_percentage=0.2)
    return train_part, val_part
def randomized_search(self, params, metric='auc', max_iterations=None,
                      max_epochs=50, early_stopping=False, use_weights=False):
    """ Standard randomized search method to select the hyper-parameters that
    result in the highest score on the test set. Each iteration will sample
    one of the possible combinations of hyper-parameters. Uses ParameterGrid
    class from scikit-learn in order to create an iterable of all possible
    hyper-parameter combinations. The user can supply a max_iterations value
    that will stop the search once said number of combinations has been
    reached. Furthermore, early_stopping can be set to True to stop the
    training of a particular model when the test score has stopped improving,
    which is particularly useful when overfitting.

    :param params:(dict, required) - dictionary of parameters to test,
        {parameter: [list of values to try]}
    :param metric:(string, optional) - metric to use to pick the best model
    :param max_iterations:(int, optional) - if provided, the hyper-parameter
        optimization will stop after this many tests, irrespective of
        len(ParameterGrid(params))
    :param max_epochs:(int, optional) - max number of epochs to train each model
    :param early_stopping:(bool, optional) - if True, the training of a model
        will be partial and will stop after 5 epochs of non-improvement on the
        test score; the model will then be re-trained using the optimal number
        of epochs
    :param use_weights:(bool, optional) - if True, the training procedure will
        use weights to value repeated interactions more
    """
    # Raise an error if any of the parameters supplied is not one of the
    # arguments used by self.init_model
    valid_params = self.init_model.__code__.co_varnames
    if any([x not in valid_params for x in params.keys()]):
        raise ValueError(
            "One of the hyper-parameters supplied is invalid. Please make sure there are no typos."
        )
    # Reset best values
    self.best_model = None
    self.best_params = None
    self.best_score = 0
    # create train and test datasets
    (train_set, test_set) = random_train_test_split(self._interactions,
                                                    test_percentage=0.2)
    if use_weights and self._weights is not None:
        # Rebuild a COO weight matrix restricted to the training interactions.
        weights_csr = self._weights.tocsr()
        data = [
            weights_csr[u, i] for u, i in zip(train_set.row, train_set.col)
        ]
        train_weights = sp.coo_matrix(
            (data, (train_set.row, train_set.col)),
            shape=self._weights.shape,
            dtype=self._weights.dtype)
    else:
        train_weights = None
    # Create ParameterGrid instance to be iterated and cast it to list
    grid = list(ParameterGrid(params))
    # If max_iterations has not been provided then test all parameter combinations
    if not max_iterations:
        max_iterations = len(grid)
    # Shuffle the list and pop out and remove the last element
    random.shuffle(grid)
    test_params = grid.pop()
    test_params_idx = 1
    start_time = time.time()
    while test_params and test_params_idx <= max_iterations:
        # Initialize model with current combination of hyper-parameters to be tested
        self.init_model(**test_params)
        if early_stopping:
            best_iter = 0
            best_score = 0
            iters_no_improvement = 0
            # Train the model for max_epochs, evaluating it at each step
            for i in range(max_epochs):
                self.train(train_set, sample_weight=train_weights,
                           partial=True)
                test_score = self.evaluate_model(self.model, metric,
                                                 test_set, train_set)
                if test_score > best_score:
                    best_iter = i + 1
                    best_score = test_score
                    iters_no_improvement = 0
                else:
                    iters_no_improvement += 1
                # If the test score has not improved in the last 5 epochs
                # stop the training
                if iters_no_improvement == 5:
                    break
            # If the last epoch did not result in the highest test score,
            # re-train the model for the optimal number of epochs
            if best_iter != max_epochs:
                self.init_model(**test_params)
                self.train(train_set, sample_weight=train_weights,
                           epochs=best_iter)
                test_score = self.evaluate_model(self.model, metric,
                                                 test_set, train_set)
        else:
            self.train(train_set, sample_weight=train_weights,
                       epochs=max_epochs)
            test_score = self.evaluate_model(self.model, metric, test_set,
                                             train_set)
        # If the test score achieved by this model was the highest so far,
        # set the class variables accordingly
        if test_score > self.best_score:
            self.best_model = self.model
            self.best_params = test_params
            self.best_score = test_score
        # Draw the next random parameter combination, if any remain.
        random.shuffle(grid)
        if grid:
            test_params = grid.pop()
        else:
            test_params = None
        elapsed_time = (time.time() - start_time) / 60
        print(
            'Hyperparameters tested: {}/{}; {} score: {}; total time: {:.2f} minutes'
            .format(test_params_idx, max_iterations, metric, test_score,
                    elapsed_time))
        test_params_idx += 1
    print(
        'The best model achieved a {} score of {} on the test set, with parameters {}'
        .format(metric, self.best_score, self.best_params))
""" """# Recommender System ### LightFM Implementation """ pip install -qq lightfm from lightfm import LightFM from lightfm.cross_validation import random_train_test_split from lightfm.evaluation import * # Split interactions to train and test sets train, test = random_train_test_split(interactions,test_percentage=0.1,random_state=42) """##### Define LightFM model""" hybrid = LightFM(no_components=32,random_state=42,loss='warp',item_alpha=1e-06,user_alpha=1e-06) hybrid.fit(train,user_features,item_features,epochs=10,num_threads=4,verbose=True) """##### Evaluation: AUC score """ hybrid_train_auc = auc_score(hybrid,train,item_features=item_features,user_features=user_features,num_threads=4) hybrid_test_auc = auc_score(hybrid,test,train_interactions=train,item_features=item_features,user_features=user_features,num_threads=4) print('Hybrid model train AUC score: %.5f' %hybrid_train_auc.mean()) print('Hybrid model test AUC score: %.5f' %hybrid_test_auc.mean())
def run_validation(test_fraction, max_val):
    """Run `max_val` random validation rounds comparing a cold-start (cs)
    model without user features against a warm-start (ws) model with them.

    Parameters
    ----------
    test_fraction : float
        Fraction of interactions held out for testing each round.
    max_val : int
        Number of validation iterations.

    Mean precision@k, recall@k and AUC per round are printed and written to
    data/validation/df.csv.
    """
    # containers to hold results
    ave_precision_at_k_cs = []
    ave_recall_at_k_cs = []
    ave_auc_score_cs = []
    ave_precision_at_k_ws = []
    ave_recall_at_k_ws = []
    ave_auc_score_ws = []

    # perform validation
    validation_itr = 0
    while (validation_itr < max_val):
        print("Start validating cold, warm start, iteration %s" %validation_itr)
        # prevent random failure to abort entire job
        try:
            # count
            validation_itr += 1
            # create data_train
            data_cs = Dataset()
            data_ws = Dataset(user_identity_features=True)
            # user features
            user_features, user_feature_names = get_user_features()
            print(user_feature_names)
            # create map between user_id, post_id, user_features and internal indices
            data_cs.fit((x['user_id'] for x in get_data()),
                        (x['post_id'] for x in get_data()))
            data_ws.fit((x['user_id'] for x in get_data()),
                        (x['post_id'] for x in get_data()),
                        user_features=user_features)
            # print shape
            num_users, num_items = data_ws.interactions_shape()
            print('Num users: {}, num_items {}.'.format(num_users, num_items))

            #---------------------------
            # Building the interactions matrix
            #---------------------------
            # create interaction matrix to optimize
            (interactions_cs, weights_cs) = data_cs.build_interactions(
                ((x['user_id'], x['post_id'])) for x in get_data())
            (interactions_ws, weights_ws) = data_ws.build_interactions(
                ((x['user_id'], x['post_id'])) for x in get_data())
            print(repr(interactions_ws))

            # retrieve mapping from dataset
            user_id_map_cs, user_feature_map_cs, item_id_map_cs, item_feature_map_cs = data_cs.mapping()
            user_id_map_ws, user_feature_map_ws, item_id_map_ws, item_feature_map_ws = data_ws.mapping()

            # split test and train
            interaction_train_cs, interaction_test_cs = cross_validation.random_train_test_split(interactions_cs, test_fraction)
            interaction_train_ws, interaction_test_ws = cross_validation.random_train_test_split(interactions_ws, test_fraction)

            #---------------------------
            # train model
            #---------------------------
            model_cs = LightFM(learning_rate=0.05, loss='warp')
            model_ws = LightFM(learning_rate=0.05, loss='warp',
                               no_components=len(user_feature_names))
            model_cs.fit(interaction_train_cs, epochs=30)
            model_ws.fit(interaction_train_ws, user_features=user_features,
                         epochs=30)

            #---------------------------
            # make predictions
            #---------------------------
            precision_at_k_cs = evaluation.precision_at_k(model_cs, interaction_test_cs, interaction_train_cs)
            recall_at_k_cs = evaluation.recall_at_k(model_cs, interaction_test_cs, interaction_train_cs)
            auc_score_cs = evaluation.auc_score(model_cs, interaction_test_cs, interaction_train_cs)
            precision_at_k_ws = evaluation.precision_at_k(model_ws, interaction_test_ws, interaction_train_ws, user_features=user_features)
            recall_at_k_ws = evaluation.recall_at_k(model_ws, interaction_test_ws, interaction_train_ws, user_features=user_features)
            auc_score_ws = evaluation.auc_score(model_ws, interaction_test_ws, interaction_train_ws, user_features=user_features)

            # append score from each iteration to results (mean per user)
            ave_precision_at_k_cs.append(sum(precision_at_k_cs) / len(precision_at_k_cs))
            ave_recall_at_k_cs.append(sum(recall_at_k_cs) / len(recall_at_k_cs))
            ave_auc_score_cs.append(sum(auc_score_cs) / len(auc_score_cs))
            ave_precision_at_k_ws.append(sum(precision_at_k_ws) / len(precision_at_k_ws))
            ave_recall_at_k_ws.append(sum(recall_at_k_ws) / len(recall_at_k_ws))
            ave_auc_score_ws.append(sum(auc_score_ws) / len(auc_score_ws))
        except Exception as e:
            # BUG FIX: was a bare `except:` with a typo'd message — it also
            # swallowed KeyboardInterrupt/SystemExit and hid the actual error.
            print("Iteration %s failed (%s). Skipping.." % (validation_itr, e))

    print("Validation score for test")
    print(ave_precision_at_k_cs)
    print(ave_recall_at_k_cs)
    print(ave_auc_score_cs)
    print(ave_precision_at_k_ws)
    print(ave_recall_at_k_ws)
    print(ave_auc_score_ws)

    df_result = pd.DataFrame({
        'precision_at_k_cs': ave_precision_at_k_cs,
        'recall_at_k_cs': ave_recall_at_k_cs,
        'auc_score_cs': ave_auc_score_cs,
        'precision_at_k_ws': ave_precision_at_k_ws,
        'recall_at_k_ws': ave_recall_at_k_ws,
        'auc_score_ws': ave_auc_score_ws,
    })
    # save to file
    df_result.to_csv("data/validation/df.csv", index=False)
    return
# NOTE(review): the opening of this merge(...) call lies outside this chunk;
# the fragment below is kept as-is.
, how="left"
, on="ID")
pos4_full_tfidf.drop_duplicates(subset="ID", inplace=True)
pos4_full_tfidf = pos4_full_tfidf.fillna(value=0)

pos5_full_tfidf = pos5_tfidf.merge(all_dummies
                                   , how="left"
                                   , on="ID")
pos5_full_tfidf.drop_duplicates(subset="ID", inplace=True)
pos5_full_tfidf = pos5_full_tfidf.fillna(value=0)

### Convert data to sparse matrix and split for cv###
pos1_spr = sp.sparse.csr_matrix(pos1_full_tfidf.set_index("ID").values)
pos1_train, pos1_test = random_train_test_split(pos1_spr
                                                , test_percentage=0.25
                                                , random_state = None)

### create and train LightFM model ###
NUM_THREADS = 4
NUM_COMPONENTS = 5
NUM_EPOCHS = 30
ITEM_ALPHA = 1e-6
pos1_model = LightFM(loss='warp'
                     , item_alpha=ITEM_ALPHA
                     , no_components=NUM_COMPONENTS)
# NOTE(review): `%time` is IPython magic, not valid plain Python — this line
# only runs inside a notebook/IPython session.
%time pos1_model = pos1_model.fit(pos1_train, epochs=NUM_EPOCHS, num_threads=NUM_THREADS)
def init_movielens(path, min_rating=0.0, k=3, item_features=None,
                   cluster_n=18, model='vgg19', test_percentage=0.2):
    """Load MovieLens ratings from `path`, build train/test interaction
    matrices and, optionally, an item-feature matrix.

    Parameters
    ----------
    path : str
        Directory containing ratings.csv.
    min_rating : float
        Ratings below this value are discarded.
    k : int
        Threshold used when reporting users with few interactions.
    item_features : list or None
        Subset of {'genres', 'clusters'} selecting which item features to build.
    cluster_n : int
        Number of poster clusters (used when 'clusters' is requested).
    model : str
        Embedding model name for poster clustering.
    test_percentage : float
        Fraction of interactions held out for the test set.

    Returns
    -------
    dict
        Keys: 'train', 'test', 'train-mapping', 'item_features'.
    """
    valid_item_features = {'genres': 'genres', 'clusters': 'clusters'}
    if item_features is not None:
        # NOTE(review): assert is stripped under `python -O`; an explicit
        # raise would be more robust for input validation.
        assert all(item in valid_item_features.values() for item in item_features), \
            'Your specified item features is invalid. You have to use one or more of this: ' \
            + ', '.join(valid_item_features)
    train_dataset = Dataset()
    test_dataset = Dataset()
    data = dict()
    min_interactions = dict()
    with open(path + '/ratings.csv', 'r') as ratings_file:
        reader = csv.reader(
            ratings_file,
            delimiter=',',
        )
        next(reader)  # skip header
        ratings = []
        users = set()
        items = set()
        for row in reader:
            user_id = int(row[0])
            item_id = int(row[1])
            users.add(user_id)
            items.add(item_id)
            rating = float(row[2])
            if rating >= min_rating:
                ratings.append((user_id, item_id, rating))
                __add_interaction(min_interactions, user_id)
    __info_no_of_min_interactions(
        k, 'No of interactions per user overall ==> ', min_interactions)
    users = list(users)
    items = list(items)
    users_column, items_column, ratings_column = zip(*ratings)
    # Sparse rating matrix indexed directly by the raw user/item ids.
    ratings = sparse.coo_matrix(
        (ratings_column, (users_column, items_column)))
    # Fixed seed (7) so the split is reproducible across runs.
    ratings_train, ratings_test = random_train_test_split(
        ratings,
        test_percentage=test_percentage,
        random_state=np.random.RandomState(7))
    # NOTE(review): these zips are single-use generators — the *_to_count
    # copies exist because each can only be consumed once.
    ratings_train_to_count = zip(ratings_train.row, ratings_train.col,
                                 ratings_train.data)
    ratings_train = zip(ratings_train.row, ratings_train.col,
                        ratings_train.data)
    ratings_test_to_count = zip(ratings_test.row, ratings_test.col,
                                ratings_test.data)
    ratings_test = zip(ratings_test.row, ratings_test.col, ratings_test.data)
    min_interactions = __count_train_test_min_interactions(
        ratings_train_to_count)
    __info_no_of_min_interactions(
        k, 'No of interactions per user on train ==> ', min_interactions)
    min_interactions = __count_train_test_min_interactions(
        ratings_test_to_count)
    __info_no_of_min_interactions(
        k, 'No of interactions per user on test ==> ', min_interactions)
    # Both datasets are fitted on the full user/item universe so their
    # internal index mappings agree.
    train_dataset.fit(users=users, items=items)
    test_dataset.fit(users=users, items=items)
    (train_interactions,
     train_weights) = train_dataset.build_interactions(ratings_train)
    (test_interactions,
     test_weights) = test_dataset.build_interactions(ratings_test)
    data.update({'train': train_interactions})
    data.update({'test': test_interactions})
    data.update({'train-mapping': train_dataset.mapping()})
    # add item features
    if item_features is not None:
        aggregated_features = []
        if valid_item_features.get('genres') in item_features:
            movie_genres, genres = __init_movies_genres(path)
            aggregated_features.append(movie_genres)
            train_dataset.fit_partial(item_features=genres)
            test_dataset.fit_partial(item_features=genres)
            train_dataset.fit_partial(items=list(movie_genres.keys()))
            test_dataset.fit_partial(items=list(movie_genres.keys()))
        if valid_item_features.get('clusters') in item_features:
            movies_posters_clusters, clusters = __init_movies_posters_clusters(
                path, cluster_n, model=model)
            aggregated_features.append(movies_posters_clusters)
            train_dataset.fit_partial(item_features=clusters)
            test_dataset.fit_partial(item_features=clusters)
            train_dataset.fit_partial(
                items=list(movies_posters_clusters.keys()))
            test_dataset.fit_partial(
                items=list(movies_posters_clusters.keys()))
        aggregated_features = __aggregate_features(aggregated_features)
        item_features = train_dataset.build_item_features(
            ((movie_id, aggregated_features.get(movie_id))
             for movie_id in aggregated_features.keys()))
        # Built only for its side effect of keeping the test dataset's
        # feature mapping in sync; the return value is unused.
        _ = test_dataset.build_item_features(
            ((movie_id, aggregated_features.get(movie_id))
             for movie_id in aggregated_features.keys()))
        data.update({'item_features': item_features})
    else:
        data.update({'item_features': None})
    return data
from lightfm.evaluation import precision_at_k
from scipy.sparse import identity
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from logger import Logger

#%%[markdown]
# # > Preparation

#%%[markdown]
# ## >> Load data

# In[ ]:
# Pickled scipy CSR matrix of user x item interactions.
ratings_pivot_csr_filename = "data/intersect-20m/ratings.csr"
# NOTE(review): the file handle is never closed — prefer `with open(...)`.
ratings_pivot = pickle.load(open(ratings_pivot_csr_filename, 'rb'))

#%%[markdown]
# ## >> Split data
train, test = random_train_test_split(ratings_pivot, test_percentage=0.2)

# %%[markdown]
# ## >> User & Item features
# Identity matrix to represent users and items feature

# In[ ]:
user_identity = identity(train.shape[0])
item_identity = identity(train.shape[1])

#%%[markdown]
# ## >> Set logger
# One log folder per run, named by the current Unix timestamp.
timestamp = str(datetime.timestamp(datetime.now()))
logger = Logger()
session_log_path = "log/{}/".format(timestamp)
logger.create_session_folder(session_log_path)
logger.set_default_filename(session_log_path + "log.txt")

# %%[markdown]