class Preprocessor(object):
    """Periodically copy all poems and poets from MySQL into ``Memory``.

    ``run`` starts a background thread that reloads the data every
    ``REFRESH_INTERVAL_SECONDS`` seconds; ``load_all_feeds`` performs a
    single reload and can also be called directly.
    """

    # Seconds to wait between two reloads of the in-memory data.
    REFRESH_INTERVAL_SECONDS = 5 * 60
    # How many top-starred poem ids are kept in ``memory.popular_poem_ids``.
    POPULAR_POEM_COUNT = 1000

    def __init__(self):
        self.memory = Memory()
        self.mysql_db = MysqlDB()

    @synchronized
    def load_all_feeds(self):
        """Load every poem and poet from the database into memory.

        Populates ``memory.all_poems_dict``, ``memory.all_poets_dict``,
        ``memory.all_poem_ids`` and ``memory.popular_poem_ids``.

        Raises:
            AssertionError: if the database returns no poems. Nothing in
                ``memory`` is modified in that case.
        """
        # Timing remarks ("~2s", "~1.5s") are the original author's notes.
        all_poems = self.mysql_db.get_all_poems()  # ~2s
        all_poets = self.mysql_db.get_all_poets()  # ~2s
        # Explicit raise instead of ``assert`` so the guard survives
        # ``python -O``; the exception type is unchanged for callers.
        if not all_poems:
            raise AssertionError("get_all_poems() returned no poems")

        self.memory.all_poems_dict = {
            row.id: Poem(**row.to_dict())
            for row in all_poems
        }  # ~1.5s
        self.memory.all_poets_dict = {
            row.id: Poet(**row.to_dict())
            for row in all_poets
        }  # ~1.5s
        self.memory.all_poem_ids = list(self.memory.all_poems_dict.keys())
        # Must run after the two assignments above: the ranking reads them.
        self.memory.popular_poem_ids = self.get_popular_poem_ids(
            self.POPULAR_POEM_COUNT)

        log_str = '*** load {} feeds to memory'.format(
            len(self.memory.all_poems_dict))
        print(log_str)
        logging.info(log_str)

    def get_popular_poem_ids(self, num):
        """Return the ids of at most ``num`` poems selected by ``star``.

        Reads ``memory.all_poem_ids`` and ``memory.all_poems_dict``, so both
        must already be populated. (Original note: takes about 0.023s.)

        Args:
            num: maximum number of poem ids to return.

        Returns:
            A list of poem ids, in the order produced by ``largest_indices``.
        """
        poem_ids = self.memory.all_poem_ids
        poem_stars = [
            self.memory.all_poems_dict[poem_id].star for poem_id in poem_ids
        ]
        # NOTE(review): ``largest_indices`` is defined elsewhere; presumably
        # it returns the positions of the n largest values - confirm ordering.
        top_positions = largest_indices(np.asarray(poem_stars),
                                        min(num, len(poem_stars)))
        return [poem_ids[position] for position in top_positions]

    @synchronized
    def task(self, app):
        """Reload the data forever, sleeping between reloads.

        Pushes an application context for ``app`` first, then loops. A failed
        reload is logged and retried on the next cycle instead of terminating
        the loop, so the previously loaded data stays available.

        NOTE(review): this method and ``load_all_feeds`` are both decorated
        with ``@synchronized`` and this loop never returns - confirm that the
        decorator (defined elsewhere) tolerates the nested call and that no
        other caller needs the same lock.

        Args:
            app: application object exposing ``app_context()``.
        """
        app.app_context().push()
        while True:
            try:
                self.load_all_feeds()
            except Exception:
                # Top-level boundary of the refresh thread: one bad cycle
                # (e.g. a database error) must not stop future refreshes.
                logging.exception(
                    '*** load feeds to memory failed, retrying next cycle')
            time.sleep(self.REFRESH_INTERVAL_SECONDS)

    def run(self, app):
        """Start ``task`` for ``app`` in a new thread and return immediately.

        NOTE(review): the thread is not a daemon thread, so it is not stopped
        automatically at interpreter shutdown - confirm this is intended.
        """
        from threading import Thread
        worker = Thread(target=self.task, args=[app])
        worker.start()
class TagTask(object):
    """Offline jobs: tag poems in the database and fabricate rating history.

    Construction loads all poems and poets into ``Memory`` because both jobs
    read from it.
    """

    def __init__(self):
        self.mysql_db = MysqlDB()
        self.memory = Memory()
        self.load_all_poems()

    def load_all_poems(self):
        """Load every poem and poet from the database into memory.

        Populates ``memory.all_poems_dict``, ``memory.all_poets_dict`` and
        ``memory.all_poem_ids``.

        Raises:
            AssertionError: if the database returns no poems.
        """
        # Timing remarks ("~2s", "~1.5s") are the original author's notes.
        all_poems = self.mysql_db.get_all_poems()  # ~2s
        all_poets = self.mysql_db.get_all_poets()  # ~2s
        # Explicit raise instead of ``assert`` so the guard survives
        # ``python -O``; the exception type is unchanged for callers.
        if not all_poems:
            raise AssertionError("get_all_poems() returned no poems")

        self.memory.all_poems_dict = {
            row.id: Poem(**row.to_dict())
            for row in all_poems
        }  # ~1.5s
        self.memory.all_poets_dict = {
            row.id: Poet(**row.to_dict())
            for row in all_poets
        }  # ~1.5s
        self.memory.all_poem_ids = list(self.memory.all_poems_dict.keys())

    @staticmethod
    def _fake_star(user_id, age, gender, user_tags, poem_tags, genre_fans):
        """Compute the synthetic rating one user gives one poem.

        The score is the number of tags shared by user and poem, plus 5 for
        the first genre the user favours that the poem carries, plus 1 for an
        age-related match and plus 1 for a gender-related match.
        """
        star = len(user_tags & poem_tags)

        # Only the first matching genre counts (originally an if/elif chain).
        for fans, genre in genre_fans:
            if user_id in fans and genre in poem_tags:
                star += 5
                break

        if age <= 22 and '爱情' in poem_tags:
            star += 1
        elif age > 40 and {'怀古', '重阳', '抒情', '思念'} & poem_tags:
            star += 1

        if gender == '男' and {'豪放', '战争', '励志'} & poem_tags:
            star += 1
        elif gender == '女' and {'婉约', '闺怨', '读书'} & poem_tags:
            star += 1

        return star

    def create_fake_history(self):
        """Generate a batch of fake rating records and write them to CSV.

        The random generator is seeded, and the order of random calls is kept
        fixed, so repeated runs over the same poems produce the same records.
        The first 70% of the rows go to ``train.txt`` and the remaining 30%
        to ``test.txt`` (both under ``wd_data_dir``); the full set goes to
        ``history.csv`` in the current working directory.

        NOTE(review): the set intersections below assume the in-memory
        ``Poem.tags`` is a set - confirm against the ``Poem`` class.
        """
        random.seed(100)
        user_count = 100
        preference_count = 30

        # Users who favour each genre. Sampling order is part of the seeded
        # sequence, so do not reorder. Sets give O(1) membership tests.
        # (The original author left 乐府 / 民谣 / 古文观止 disabled.)
        genre_fans = [
            (set(random.sample(range(user_count), preference_count)), genre)
            for genre in ("唐诗", "宋词", "元曲", "诗经", "楚辞")
        ]

        data = {
            "user_id": [],
            "poem_id": [],
            # user info (province / city are represented by region_tag)
            "age": [],
            "gender": [],
            "device_id": [],
            # user context
            "weather_tag": [],
            "wind_tag": [],
            "temperature_tag": [],
            "time_tag": [],
            "season_tag": [],
            "festival_tag": [],
            "region_tag": [],
            # rating
            "star": [],
        }

        for user_id in range(user_count):
            record = {}
            record["user_id"] = user_id
            record["age"] = random.randint(10, 70)
            record["gender"] = random.choice(['男', '女'])
            record["device_id"] = random.choice(['Android', 'iPhone'])
            record["weather_tag"] = random.choice(weather_tags)
            record["wind_tag"] = random.choice(
                [random.choice(wind_tags), "未知"])
            record["temperature_tag"] = random.choice(temperature_tags)
            record["time_tag"] = random.choice(time_tags)
            record["season_tag"] = random.choice(season_tags)
            record["festival_tag"] = random.choice(
                [random.choice(festival_tags), "无"])
            record["region_tag"] = random.choice(region_tags)
            # Snapshot taken before poem_id / star are added to the record.
            user_tags = set(record.values())

            # Each user rates between 0 and 100 randomly chosen poems.
            for _ in range(random.randint(0, 100)):
                poem_id = random.choice(self.memory.all_poem_ids)
                poem = self.memory.all_poems_dict[poem_id]
                record["poem_id"] = poem_id
                record["star"] = self._fake_star(
                    user_id, record["age"], record["gender"], user_tags,
                    poem.tags, genre_fans)
                for key, value in record.items():
                    data[key].append(value)

        df_data = pd.DataFrame(data=data)
        split_at = int(0.7 * len(df_data))
        # The larger 70% share is the training set; the original wrote it to
        # test.txt and the 30% remainder to train.txt (file names swapped).
        df_data[:split_at].to_csv(path_or_buf=os.path.join(
            wd_data_dir, "train.txt"), sep=",", index=False)
        df_data[split_at:].to_csv(path_or_buf=os.path.join(
            wd_data_dir, "test.txt"), sep=",", index=False)
        df_data.to_csv(path_or_buf="history.csv", sep=",", index=False)

    def tag_poems(self):
        """Add tags to every poem row and commit the changes.

        For each poem the new tags are the poet's city (when the poet is in
        memory and has one) plus every entry of ``all_tags`` that occurs in
        the poem's ``about``, ``fanyi``, ``shangxi`` or ``content`` text.
        They are merged with the existing comma-separated ``tags``; empty
        entries and "无" are dropped. Existing tags keep their order and new
        ones are appended, so re-running the job yields a stable string.
        Commits every 1000 poems and once more at the end, printing the
        running count each time.
        """
        poems = self.mysql_db.session.query(Poem).all()
        count = 0
        for poem in poems:
            found_tags = []

            poet = self.memory.all_poets_dict.get(poem.poet_id)
            if poet is not None and poet.city:
                found_tags.append(poet.city)

            # Missing (None) text fields are treated as empty instead of
            # raising TypeError; the newline separator prevents a tag from
            # matching across the boundary of two adjacent fields.
            content = "\n".join(
                part or ""
                for part in (poem.about, poem.fanyi, poem.shangxi,
                             poem.content))
            found_tags.extend(tag for tag in all_tags if tag in content)

            existing_tags = poem.tags.split(",") if poem.tags else []
            # dict.fromkeys de-duplicates while keeping first-seen order.
            merged_tags = [
                tag for tag in dict.fromkeys(existing_tags + found_tags)
                if tag not in ("", "无")
            ]
            poem.tags = ",".join(merged_tags)  # modifies the mapped row

            count += 1
            if count % 1000 == 0:
                self.mysql_db.session.commit()
                print(count)

        self.mysql_db.session.commit()
        print(count)

    def tag_poets(self):
        """Tag poets (not implemented yet)."""

    def run(self):
        """Run the poem tagging job."""
        self.tag_poems()