def nmf_ratings_predicate(observed_ratings_df, truth_ratings_df, fold='0', phase='eval'):
    """
    nmf_ratings Predicates

    Fits an NMF model on the observed ratings and writes the model's
    estimate for every index entry of the truth frame under the
    'nmf_rating_obs' predicate name.

    The first two components of each truth index entry are used as the
    user id and the item id respectively.
    """
    print("NMF predicates")

    nmf_model = NMF()
    rating_reader = Reader(rating_scale=(0.2, 1))

    # The model is trained on plain userId / movieId / rating columns, so
    # the index of the observed frame is flattened back into columns first.
    observed_flat = observed_ratings_df.reset_index()
    training_frame = observed_flat.loc[:, ['userId', 'movieId', 'rating']]
    training_data = Dataset.load_from_df(df=training_frame, reader=rating_reader)
    nmf_model.fit(training_data.build_full_trainset())

    # make predictions
    predictions = pd.DataFrame(index=truth_ratings_df.index, columns=['rating'])
    for key in truth_ratings_df.loc[:, ['rating']].index:
        uid, iid = key[0], key[1]
        estimate = nmf_model.predict(uid, iid).est
        predictions.loc[(uid, iid), 'rating'] = estimate

    write(predictions, 'nmf_rating_obs', fold, phase)
class DataLoader(object):
    """
    Builds (or reloads) per-category rating, price and ranking features.

    On construction the feature file ``../feature/<save_name>`` is loaded if
    it exists; otherwise the raw ratings / metadata files are read, an NMF
    model is fitted, predicted ratings and per-category rankings are
    computed, and the result is pickled to that path.
    """

    def __init__(self, category, save_name):
        """
        category:  iterable of category names; each name ``c`` is expected
                   to have ``ratings_<c>.csv`` and ``meta_<c>.json.gz``
                   under ``../data``.
        save_name: file name of the feature cache under ``../feature``.
        """
        self.category = category
        self.max_user = 10000  # maximum number of users
        self.price_dict = {}
        self.price_dict_temp = {}
        self.cate_dict = {}
        self.cate_dict_temp = {}
        self.top_value = 15  # top x features in SVD
        self.model = NMF()
        self.topk = 500  # maximum items in each category, finding the top k popular
        self.max_price = {}
        self.save_path = os.path.join("..", "feature", save_name)
        if not os.path.isfile(self.save_path):
            self.load_data()  # load raw data
            # NOTE: create_user_item_matrix() is the old method and is not
            # part of the pipeline any more.
            self.create_ratings()
            self.gen_new_price_dict()
            self.save_data(self.save_path)  # save the feature
        else:
            self.load(self.save_path)  # load the feature

    def load_ratings(self, filename):
        """Read ``../data/<filename>`` as a headerless CSV with the columns
        user, item, rating, timestamp and return it as a DataFrame."""
        with open(os.path.join("..", "data", filename), "rb") as f:
            ratings = pd.read_csv(f, names=("user", "item", "rating", "timestamp"))
        return ratings

    def load_prices(self, filename):
        """
        Return ``{asin: price}`` for the records of ``../data/<filename>``
        whose first sales-rank value is below ``self.topk``.

        Records that lack a price / asin / salesRank entry, or whose
        sales rank cannot be read or compared, are counted and skipped.
        """
        price_dict = {}
        num_no_price = 0
        for review in parse(os.path.join("..", "data", filename)):
            try:
                price = review['price']
                asin = review['asin']
                v = list(review['salesRank'].values())[0]
                if v < self.topk:
                    price_dict[asin] = price
            except (KeyError, IndexError, AttributeError, TypeError):
                # Only malformed / incomplete records are skipped; anything
                # else (e.g. KeyboardInterrupt) is allowed to propagate.
                num_no_price += 1
                continue
        print("filename:", filename)
        print("length of price dict:", len(price_dict))
        print("# of items without price", num_no_price)
        return price_dict

    def load_data(self):
        """
        Load ratings and prices for every category.

        Fills ``price_dict_temp``, ``cate_dict_temp`` and ``max_price`` and
        accumulates the ratings of all categories (restricted to items that
        have a price) in ``self.ratings``.
        """
        print("Loading data:")
        for i in self.category:
            ratings_name = "ratings_" + i + ".csv"
            price_name = "meta_" + i + ".json.gz"
            ratings_temp = self.load_ratings(ratings_name)
            print(len(ratings_temp))
            price_temp = self.load_prices(price_name)
            ratings_temp = ratings_temp[ratings_temp['item'].isin(price_temp.keys())]
            print(len(ratings_temp))
            self.price_dict_temp.update(price_temp)
            self.max_price[i] = max(price_temp.values())
            self.cate_dict_temp.update({j: i for j in price_temp})
            price_temp.clear()
            # The first category has nothing to merge with. Checking for
            # that explicitly (instead of catching every exception) means a
            # genuine merge failure no longer silently drops the ratings
            # accumulated so far.
            existing = getattr(self, "ratings", None)
            if existing is None:
                self.ratings = ratings_temp
            else:
                self.ratings = pd.merge(existing, ratings_temp, how='outer')
        print(self.max_price)

    # old method
    def create_user_item_matrix(self, user_key="user", item_key="item"):
        """
        Build the sparse user-item matrix of ratings shifted by -3, together
        with the raw-id <-> row/column index mappers.
        """
        users = np.unique(self.ratings[user_key])
        items = np.unique(self.ratings[item_key])
        n = len(users)
        d = len(items)

        self.user_mapper = dict(zip(users, range(n)))
        self.item_mapper = dict(zip(items, range(d)))
        self.user_inverse_mapper = dict(zip(range(n), users))
        self.item_inverse_mapper = dict(zip(range(d), items))

        self.user_ind = [self.user_mapper[i] for i in self.ratings[user_key]]
        self.item_ind = [self.item_mapper[i] for i in self.ratings[item_key]]

        self.ratings_matrix = sparse_matrix(
            (self.ratings["rating"] - 3, (self.user_ind, self.item_ind)),
            shape=(n, d))
        print("user-item matrix generated.")

    @staticmethod
    def _sample_users(users, max_user):
        """Return at most ``max_user`` users drawn without replacement from
        the sequence ``users``."""
        # Capping the sample size avoids the ValueError raised when there
        # are fewer users than max_user.
        return sample(users, min(max_user, len(users)))

    @staticmethod
    def _rank_within_categories(ratings_predict, inv_cate_dict):
        """
        Rank every item against the other items of its own category.

        ratings_predict: 2-D array, one row per user, one column per item.
        inv_cate_dict:   ``{category: [item column indices]}``.

        Returns an array of the same shape where entry ``[u, j]`` is
        1 + the number of items in j's category that user ``u`` rates
        strictly higher than ``j`` (so tied items share the best rank).
        """
        ranking = np.zeros(ratings_predict.shape)
        for items in inv_cate_dict.values():
            cols = np.asarray(items, dtype=np.intp)
            if cols.size == 0:
                continue
            block = ratings_predict[:, cols]
            ascending = np.sort(block, axis=1)
            for row, (values, ordered) in enumerate(zip(block, ascending)):
                # searchsorted(..., side="right") counts the entries that
                # are <= each value; the remainder are strictly greater.
                not_greater = np.searchsorted(ordered, values, side="right")
                ranking[row, cols] = cols.size - not_greater + 1
        return ranking

    def create_ratings(self):
        """
        Fit the NMF model on a sample of users and derive the features.

        Sets ``ratings_predict`` (users x items estimated ratings),
        ``ranking`` (per-category rank of each item for each user),
        ``inv_cate_dict``, and re-keys ``price_dict`` / ``cate_dict`` by the
        training set's inner item ids.
        """
        # sample() needs a sequence (sets are rejected on Python >= 3.11);
        # unique() keeps order of first appearance, so a seeded run is
        # reproducible.
        users = self.ratings['user'].unique().tolist()
        sampled_users = self._sample_users(users, self.max_user)
        n = len(sampled_users)
        self.ratings = self.ratings[self.ratings['user'].isin(sampled_users)]

        reader = Reader(rating_scale=(1, 5))
        data = Dataset.load_from_df(self.ratings[['user', 'item', 'rating']], reader)
        train_set = data.build_full_trainset()
        self.model.fit(train_set)

        self.inv_cate_dict = {}  # {'categoryA':[],'categoryB':[]}
        for i in self.category:
            self.inv_cate_dict[i] = []

        inner_raw_items = []  # (inner id, raw id) pairs, reused below
        for j in train_set.all_items():
            item_raw = train_set.to_raw_iid(j)
            inner_raw_items.append((j, item_raw))
            item_category = self.cate_dict_temp[item_raw]
            self.inv_cate_dict[item_category].append(j)
            self.price_dict[j] = self.price_dict_temp[item_raw]
            self.cate_dict[j] = item_category
        self.cate_dict_temp.clear()
        self.price_dict_temp.clear()
        print("inv_cate_dict constructed.")

        d = 0
        for i in self.category:
            d += len(self.inv_cate_dict[i])
            print(i, ':', len(self.inv_cate_dict[i]))

        self.ratings_predict = np.zeros([n, d])
        for i in train_set.all_users():
            user_raw = train_set.to_raw_uid(i)
            for j, item_raw in inner_raw_items:
                self.ratings_predict[i][j] = self.model.predict(user_raw, item_raw).est
        print("predicted ratings generated.")

        self.ranking = self._rank_within_categories(
            self.ratings_predict, self.inv_cate_dict)
        print("user_item rankings generated.")

    def save_data(self, save_path):
        """Pickle the computed features to ``save_path``."""
        # NOTE: the raw rating matrix and the user/item mappers of the old
        # method are intentionally not part of the saved features.
        self.dict_all = {
            'prices': self.price_dict,
            'new_ratings': self.ratings_predict,
            'cate': self.cate_dict,
            'rankings': self.ranking,
            'max_price': self.max_price,
            'new_price': self.new_price_dict,
        }
        with open(save_path, 'wb') as f:
            pickle.dump(self.dict_all, f)
        print("data saved in ", save_path)

    def load(self, save_path):
        """Restore the features previously written by ``save_data``."""
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # load feature files that this program produced itself.
        with open(save_path, 'rb') as f:
            saved = pickle.load(f)
        self.ratings_predict = saved['new_ratings']
        self.price_dict = saved['prices']
        self.cate_dict = saved['cate']
        self.ranking = saved['rankings']
        self.max_price = saved['max_price']
        self.new_price_dict = saved['new_price']
        print("Saved data loaded.")

    def gen_new_price_dict(self):
        """Build ``new_price_dict``: ``{category: {item id: price}}`` from
        ``cate_dict`` and ``price_dict``."""
        self.new_price_dict = {}
        for i in self.category:
            self.new_price_dict[i] = {}
        for item_id, item_category in self.cate_dict.items():
            self.new_price_dict[item_category][item_id] = self.price_dict[item_id]
        print("new price dictionary generated.")