def file_parsing(self):
    """Parse every input file and build the word and entity vocabularies.

    Each non-blank line of every file in ``self.input_files`` must hold four
    tab-separated fields: ``user``, ``title``, ``label`` and ``entities``.
    ``title`` is split on whitespace into words; ``entities`` is a
    ``;``-separated list of ``entity_id:entity_name`` pairs.

    Side effects:
        * appends each tokenised title to ``self.corpus``;
        * increments the counters in ``self.word2freq`` and
          ``self.entity2freq`` (entity ids are converted to ``int``);
        * fills ``self.word2index`` / ``self.entity2index`` with 1-based
          indices for every word / entity whose count reaches
          ``self.min_word_count`` / ``self.min_entity_count``.
          Index 0 is left free for the dummy entry.

    Raises:
        ValueError: if a line does not have exactly four fields, if an
            entity pair has no ``:`` separator, or if an entity id is not
            an integer.
    """
    with timer("File Parsing", verbose=True):
        print("**** Starting Parsing Input Files! ****")
        for file in self.input_files:
            with open(file, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    # Only the title and the entity list are needed here.
                    _user, title, _label, entities = line.split("\t")
                    content: List[str] = title.split()
                    # collect corpus
                    self.corpus.append(content)
                    # count every word of the title
                    for word in content:
                        self.word2freq[word] += 1
                    # count every entity attached to the title
                    for pair in entities.split(";"):
                        if not pair:
                            # Tolerate a trailing or doubled ';'.
                            continue
                        # Split on the first ':' only, so an entity name that
                        # itself contains ':' no longer breaks the unpacking.
                        ent_id, _ent_name = pair.split(":", 1)
                        self.entity2freq[int(ent_id)] += 1
        # Build word2index; indices start at 1, 0 is kept for the dummy entry.
        index = 1
        for word, freq in self.word2freq.items():
            if freq >= self.min_word_count:
                self.word2index[word] = index
                index += 1
        # Build entity2index with the same 1-based scheme.
        index = 1
        for ent_id, freq in self.entity2freq.items():
            if freq >= self.min_entity_count:
                self.entity2index[int(ent_id)] = index
                index += 1
        print("Succeed in Parsing Input Files!")
        print("Words num: %d.\tEntity num: %d" %
              (len(self.word2index), len(self.entity2index)))
def transform(self, input_file: str, output_file: str):
    """Index-encode the titles of ``input_file`` and write them to ``output_file``.

    Every non-blank input line is split on tabs into ``user``, ``title``,
    ``label`` and ``entities``. ``self._encoding_title(title, entities)``
    supplies the word-index and entity-index encodings, and the output line
    is ``user``, word encoding, entity encoding and ``label`` joined by tabs.

    :param input_file: path of the data to read
    :param output_file: path of the encoded data to write
    """
    with timer("Transform", True):
        print("**** Starting Transform %s ****" % input_file)
        with open(input_file, "r", encoding="utf-8") as src, \
                open(output_file, "w", encoding="utf-8") as dst:
            for raw in src:
                stripped = raw.strip()
                if not stripped:
                    # blank lines produce no output
                    continue
                user, title, label, entities = stripped.split("\t")
                # the encodings are derived from the title and its entities
                encoded = self._encoding_title(title, entities)
                word_encoding, entity_encoding = encoded
                fields = [user, word_encoding, entity_encoding, label]
                record = "\t".join(fields)
                dst.write(record + "\n")
        print("Transformation Done!")
# NOTE(review): the indented statements below are the tail of a method whose
# `def` line lies outside this view; they are reproduced unchanged.
    for item in self.user_recall_items[user]:
        score = self.predict(user, item)
        # Map the raw prediction through the logistic (sigmoid) function.
        score = 1 / (1 + math.exp(-score))
        result.push(score, item)
    return result.queue()


def save(self):
    """Pickle the ``(self.P, self.Q)`` pair into ``self.model_file``."""
    LOGGER.info(f"Save model to `{self.model_file}`")
    with open(self.model_file, "wb") as f:
        pickle.dump((self.P, self.Q), f, protocol=pickle.HIGHEST_PROTOCOL)


def load(self):
    """Restore ``self.P`` and ``self.Q`` from the pickle in ``self.model_file``.

    NOTE(review): ``pickle.load`` can execute arbitrary code while loading,
    so only model files from a trusted source should be read here.
    """
    LOGGER.info(f"Load model from `{self.model_file}`")
    with open(self.model_file, "rb") as f:
        self.P, self.Q = pickle.load(f)


if __name__ == '__main__':
    # Demo: fit on a train split, print recommendations for one user, then
    # report precision and recall on the held-out split.
    data = read_file(os.path.join(MOVIE_LENS_SRC, "ratings.dat"))
    train, test = split(data, seed=0, test_size=0.1)
    lfm = LFMRecommend()
    lfm.fit(train)
    n_recall = 10
    with timer("Recommend"):
        print(lfm.recommend("6027", n_recall))
    precision, recall = evaluate(lfm, test, n_recall_items=n_recall)
    print(f"Precision: {precision}, Recall: {recall}."
          )  # Precision: 0.188542, Recall: 0.11229.
# NOTE(review): the `return` below is the last statement of a definition whose
# header lies outside this view; it is reproduced unchanged.
    return similarity


def user_item_score(self, user: str, item: str) -> float:
    """
    Calculate the recommend score between specified user and item.

    The score is the sum, over every entry ``(user_, sim)`` in
    ``self.users_sim[user]``, of ``sim * self.user_scores[user_][item]``.
    For example, the score computation between user `C` and item `a` is as follows:
        Score(C, a) = sum([Score(U, a) * Sim(U, C) for U in Users-besides-C])

    :param user: key into ``self.users_sim``
    :param item: key into each neighbour's entry of ``self.user_scores``
    :return: the accumulated score; ``0.0`` when ``self.users_sim[user]`` is empty
    """
    score = 0.0
    user_sim = self.users_sim[user]
    for user_, sim in user_sim.items():
        # weight the neighbour's score for `item` by its similarity to `user`
        score += sim * self.user_scores[user_][item]
    return score


def recommend(self, user: str) -> Dict[str, float]:
    """Just give the scores on non-rated items of `user`, not sorted or top-k"""
    return {
        item: self.user_item_score(user, item)
        for item in self.user_non_score_items[user]
    }


if __name__ == '__main__':
    Data = generate_score_data(100, 1000, 0.2, 0)
    # user similarity cost: 0.536 sec
    # recommend cost: 0.015 sec
    ub = UserCF(Data)
    with timer(name="User-based CF"):
        print(ub.recommend("C"))
def evaluation(self, k: int = 8, n_items: int = 10) -> Tuple[float, float]:
    """Compute precision and recall pooled over all users of ``self.test_data``.

    The test data is grouped by its ``"user"`` column into the set of
    ``"item"`` values per user. For each such user,
    ``self.recommend(user, k=k, n_items=n_items)`` is called, and every
    returned ``(_, item)`` pair counts as a hit when ``item`` is in that
    user's test set.

    :param k: forwarded unchanged to ``self.recommend``
    :param n_items: forwarded unchanged to ``self.recommend``
    :return: ``(precision, recall)`` where precision is
        ``hits / number of recommended items`` and recall is
        ``hits / number of test items``. A ratio whose denominator is zero
        (empty test set, or nothing recommended for any user) is reported
        as ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    test_user_rated_items: Dict[str, Set[str]] = \
        self.test_data.groupby("user").agg({"item": lambda items: set(items)})["item"].to_dict()
    hit = 0
    test_num = 0
    pred_num = 0
    for test_user, test_items in tqdm(test_user_rated_items.items()):
        pred_items = self.recommend(test_user, k=k, n_items=n_items)
        test_num += len(test_items)
        pred_num += len(pred_items)
        # Recommendations come back as pairs; the second element is the item.
        hit += sum(1 for _, pred_item in pred_items if pred_item in test_items)
    # Guard both ratios so an empty test set or empty recommendations
    # yield 0.0 rather than a crash.
    recall = hit / test_num if test_num else 0.0
    precision = hit / pred_num if pred_num else 0.0
    return precision, recall


if __name__ == '__main__':
    rec = ItemCFRecommend.from_file(os.path.join(MOVIE_LENS_SRC, "ratings.dat"))
    with timer("Recommend"):  # 0.1356s
        result = rec.recommend("1", k=8, n_items=40)
    print(result)
    precision, recall = rec.evaluation(k=8, n_items=10)
    print(f"Precision: {precision}, Recall: {recall}.")  # Precision: 0.188542, Recall: 0.11229.
# NOTE(review): the `return` below is the last statement of a definition whose
# header lies outside this view; it is reproduced unchanged.
    return cls(eval_results)


def evaluate(self, recommendation: FirstRec):
    """
    Evaluate on recommendation object.

    For every ``(user, eval_entries)`` pair in ``self.eval_results`` the
    recommender is queried with ``recommendation.recommend(user)``. A
    returned ``(movie, _)`` pair is a hit when ``movie`` is in
    ``eval_entries``. Per-user recall is ``hit / len(eval_entries)`` and
    per-user precision is ``hit / len(rec_results)``; both lists are
    averaged at the end. Users with an empty recommendation are reported
    and skipped.

    NOTE(review): if every user is skipped, the final averages divide by
    zero; an empty ``eval_entries`` would do the same for that user.

    :param recommendation: The recommend object, can return items for query user.
    :return: recall, precision
    """
    print("Start evaluation.")
    recalls, precisions = [], []
    for user, eval_entries in tqdm.tqdm(self.eval_results.items()):
        hit = 0
        rec_results = recommendation.recommend(user)
        if not rec_results:
            print(f"No recommendation for {user}")
            continue  # possibly because `user` not in `train_file`
        for movie, _ in rec_results:
            if movie in eval_entries:
                hit += 1
        recalls.append(hit / len(eval_entries))
        precisions.append(hit / len(rec_results))
    # mean of the per-user values, recall first
    return sum(recalls) / len(recalls), sum(precisions) / len(precisions)


if __name__ == '__main__':
    # NOTE(review): `train_file`, `test_file`, `k` and `n` are not defined in
    # the visible part of this script.
    rec = FirstRec.from_json_file(train_file, k=k, n=n)
    evaluation = Evaluation.from_json_file(test_file)
    with timer(name="Evaluation on `FirstRec`"):
        recall, precision = evaluation.evaluate(rec)
    print(f"Recall: {recall}, Precision: {precision}")
def user_item_score(self, user: str, item: str) -> float:
    """
    Score how strongly `item` should be recommended to `user` (item-based).

    Each entry `x` stored in `self.items_sim[item]` contributes its similarity
    to `item` multiplied by the score `user` has for `x`:
        Score(C, a) = sum([Sim(a, x) * Score(C, x) for x in items-rated-by-C])
    """
    total = 0.0
    for neighbour, weight in self.items_sim[item].items():
        rating = self.user_scores[user][neighbour]
        total = total + weight * rating
    return total


def recommend(self, user: str) -> Dict[str, float]:
    """Score every item `user` has not rated; the result is unsorted and not truncated."""
    scores = {}
    for candidate in self.user_non_score_items[user]:
        scores[candidate] = self.user_item_score(user, candidate)
    return scores


if __name__ == '__main__':
    Data = generate_score_data(100, 1000, 0.2, 0)
    # Timings noted by the original author:
    #   user similarity cost: 3.86 sec
    #   recommend cost: 0.0429 sec
    ub = ItemCF(Data)
    with timer(name="Item-based CF"):
        print(ub.recommend("C"))
target_entries = self.features[target_user] # Select TopK neighbourhood's entries neighbour_users = PriorityQueue(maxsize=self.k) for user, entries in self.features.items(): if user == target_user: continue corr = pearson(entries, target_entries) neighbour_users.push( corr, entries) # different from source code, push entries not users. movies = defaultdict(float) for corr, entries in neighbour_users.queue(): for movie, rate in entries.items(): movies[movie] += corr * rate # corr as the weight of user # sort movies result = sorted(movies.items(), key=lambda k: k[1], reverse=True) return result[:self.n] if __name__ == '__main__': from _utils.context import timer json_file = os.path.join(os.path.dirname(__file__), "data/train.json") rec = FirstRec.from_json_file(json_file, k=15, n=20) with timer(name="Recommend Test"): # ~30 ms print(rec.recommend("436670"))
# NOTE(review): the indented statements below are the tail of a method whose
# `def` line lies outside this view; they are reproduced unchanged.
    # Candidates: every artist when `recall_old` is truthy, otherwise only the
    # artists stored as non-rated for this user.
    recall_artists = self.total_artists if recall_old else self.user_non_rated_artists[user]
    recall_artists_tag_gene = self.artist_tag_gene[recall_artists, :]  # shape: [#recall_artists, max_tag_id + 1]
    scores = user_tag_preference.dot(recall_artists_tag_gene.T).toarray().reshape(-1)  # shape: [#recall_artists, ]
    # Return the `n_items` (artist, score) pairs with the largest scores.
    return heapq.nlargest(n_items, zip(recall_artists, scores), key=lambda pair: pair[1])


def evaluate(self, user: int) -> Tuple[float, float]:
    """Evaluate recommendation on specific user.

    ``true_num`` is ``len(self.total_artists)`` minus the number of distinct
    artists in ``self.user_non_rated_artists[user]``. That many items are
    requested from ``self.recommend`` with ``recall_old=True``, and a
    prediction is a hit when its artist is not in the user's non-rated set.

    NOTE(review): divides by ``len(pred)`` and ``true_num``; both are zero
    when the two counts above are equal - confirm callers never hit that case.

    :return: Tuple of precision and recall
    """
    n_total_artists = len(self.total_artists)
    user_non_rated_artists = set(self.user_non_rated_artists[user])
    true_num = n_total_artists - len(user_non_rated_artists)
    pred = self.recommend(user, n_items=true_num, recall_old=True)
    hit = 0
    for artist, _ in pred:
        if artist not in user_non_rated_artists:
            hit += 1
    return hit / len(pred), hit / true_num


if __name__ == '__main__':
    rec = TagBasedRecommend(k=1.0)
    with timer("TagRecommend"):
        print(rec.recommend(2, 20, recall_old=False))  # 0.009s
    precision, recall = rec.evaluate(2)
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}")  # 0.2, 0.2
# NOTE(review): the indented statements below are the body of a function whose
# `def` line lies outside this view (it is called as `gcd(pair)` further
# down); they are reproduced unchanged.
    a, b = pair
    low = min(a, b)
    # The smaller number is itself the answer when it divides both.
    if a % low == 0 and b % low == 0:
        return low
    # Otherwise try every candidate from low // 2 down to 1 and return the
    # first one that divides both numbers.
    start = low // 2
    for i in range(start, 0, -1):
        if a % i == 0 and b % i == 0:
            return i


# Input pairs fed to `gcd` by the three timing runs below.
numbers = [(1963309, 2265973), (1879675, 2493670), (2030677, 3814172),
           (1551645, 2229620), (1988912, 4736670), (2198964, 7876293)]

if __name__ == '__main__':
    # The trailing numbers are timings noted by the original author.
    with _context.timer(name="Single Thread"):  # 0.4558
        result = list(map(gcd, numbers))
    with _context.timer(name="Multi Thread"):  # 0.4019
        with ThreadPoolExecutor(max_workers=2) as pool:
            result = list(pool.map(gcd, numbers))
    with _context.timer(name="Multi Process"):  # 0.3790
        with ProcessPoolExecutor(max_workers=2) as pool:
            result = list(pool.map(gcd, numbers))

# Multi-process workflow
# 1) Every input item of the `numbers` list is passed to map.
# 2) The data is serialized into binary form with the pickle module.
# 3) The serialized data is sent over a local socket from the process of the main interpreter to the process of the child interpreter.
# 4) In the child process, pickle deserializes the binary data back into Python objects.
# NOTE(review): the indented statements below are the tail of a function whose
# `def` line (and the binding of `a` and `b`) lies outside this view; it is
# called as `gcd(pair)` further down. They are reproduced unchanged.
    low = min(a, b)
    # The smaller number is itself the answer when it divides both.
    if a % low == 0 and b % low == 0:
        return low
    # Otherwise try every candidate from low // 2 down to 1 and return the
    # first one that divides both numbers.
    start = low // 2
    for i in range(start, 0, -1):
        if a % i == 0 and b % i == 0:
            return i


# Input pairs fed to `gcd` by the demos below.
numbers = [(1963309, 2265973), (1879675, 2493670), (2030677, 3814172),
           (1551645, 2229620), (1988912, 4736670), (2198964, 7876293)]

if __name__ == "__main__":
    # The bare string statements in this block are the author's notes; the
    # comment above each one gives an English rendering.

    # 1. map: the returned results are ordered, in the same order as the
    # `*iterables` iterators.
    """1. map(self, fn, *iterables, **kwargs)"""
    """返回的results列表是有序的,顺序和 `*iterables` 迭代器的顺序一致。"""
    with _context.timer("Map test"):
        with ProcessPoolExecutor(max_workers=2) as pool:
            results = list(pool.map(gcd, numbers))
    print(results)

    # 2. submit: submits a callable that can run in parallel; submit also
    # returns a future instance. The future marks work that a thread/process
    # carries out asynchronously and that completes at some later time.
    """2. submit(self, fn, *args, **kwargs)"""
    """用于提交一个可并行的方法,submit方法同时返回一个future实例。"""
    """future对象标识这个线程/进程异步进行,并在未来的某个时间执行完成。future实例表示线程/进程状态的回调。"""
    with _context.timer("Submit test"):
        futures = []
        with ProcessPoolExecutor(max_workers=2) as pool:
            for pair in numbers:
                future = pool.submit(gcd, pair)
                futures.append(future)
        # collect the results in submission order
        results = [future.result() for future in futures]
    print(results)

    """3. future"""
"""Get not rated items for each user.""" rated_summary = rating.groupby("UserID").agg( {"MovieID": lambda s: set(s)}) rated_summary = dict(rated_summary["MovieID"]) return { user: total.difference(rated) for user, rated in rated_summary.items() } def recommend(self, user: int) -> List: """Recommend item which has not been rated by user and has biggest similarity with user's favor.""" LOGGER.info(f"Give recommendation for {user}.") user_vec = self.user_profile[user] result = PriorityQueue(self.k) non_rating_items = self.user_non_rating_items[user] LOGGER.info( f"Recommend from {len(non_rating_items)} / {len(self.total_items)} non-rated items for `{user}`" ) for movie in non_rating_items: # Not recommend rated movies item_vec = self.item_profile[movie] sim = 1 - cosine(item_vec, user_vec) result.push(sim, movie) return sorted(result.queue(), key=lambda k: k[0], reverse=True) if __name__ == '__main__': path = os.path.dirname(__file__) rec = ContentBasedRec.from_json_file(os.path.join(path, "data"), k=10) with timer("CBRecommend"): print(rec.recommend(1))
"""Compute precision and recall""" test_user_rated_items: Dict[str, Set[str]] =\ self.test_data.groupby("user").agg({"item": lambda s: set(list(s))})["item"].to_dict() hit = 0 test_num = 0 pred_num = 0 for test_user, test_items in tqdm(test_user_rated_items.items()): pred_items = self.recommend(test_user, k=k, n_items=n_items) test_num += len(test_items) pred_num += len(pred_items) for _, pred_item in pred_items: if pred_item in test_items: hit += 1 recall = hit / test_num precision = hit / pred_num return precision, recall if __name__ == '__main__': rec = UserCFRecommend.from_file(os.path.join(MOVIE_LENS_SRC, "ratings.dat")) with timer("Recommend"): # 0.006s result = rec.recommend("1", k=8, n_items=40) print(result) precision, recall = rec.evaluation(k=8, n_items=10) print(f"Precision: {precision}, Recall: {recall}." ) # Precision: 0.17529, Recall: 0.10440.