def make_general_vocab(vocab_size=4096, min_freq=2, overwrite=False):
    """Build (or reuse) the general vocabulary from cut comments and reviews.

    Counts word frequencies over every sentence of the cut single-rate
    comments and reviews, drops words occurring fewer than ``min_freq``
    times, keeps the ``vocab_size`` most frequent words and writes them,
    with their counts, to ``CONFIG.vocab_file``.

    Args:
        vocab_size: maximum number of words kept in the vocabulary.
        min_freq: minimum occurrence count for a word to be kept.
        overwrite: when False and the vocab file exists, reuse it.

    Returns:
        The list of vocabulary words, most frequent first.
    """
    if os.path.exists(CONFIG.vocab_file) and not overwrite:
        return read_lines(CONFIG.vocab_file, lambda x: x.split()[0])

    vocab_counter = defaultdict(int)  # word -> frequency

    def update_with_cut_reviews(reviews):
        # Each review is a list of pre-cut sentences: whitespace-joined tokens.
        for review in reviews:
            for sentence in review:
                for word in sentence.split():
                    vocab_counter[word] += 1

    update_with_cut_reviews(load_json_file(CONFIG.single_rate_comment_cut))
    update_with_cut_reviews(load_json_file(CONFIG.single_rate_review_cut))

    # Keep sufficiently frequent words, most frequent first, capped at vocab_size.
    items = [x for x in vocab_counter.items() if x[1] >= min_freq]
    items.sort(key=lambda x: x[1], reverse=True)
    items = items[:vocab_size]
    write_lines(CONFIG.vocab_file, items, lambda x: '%s %d' % (x[0], x[1]))
    # Return the words so both paths yield the same result; the original
    # implicitly returned None after (re)building the file.
    return [x[0] for x in items]
def make_vocab_lookup(vocab_file, reverse=False, unk_token=None):
    """Build a lookup table for the vocabulary stored in ``vocab_file``.

    Args:
        vocab_file: path to a file with one ``word [count]`` entry per line;
            only the first whitespace-separated field of each line is used.
        reverse: when True, build an id->word mapping; otherwise word->id.
        unk_token: optional token inserted at position 0 (id 0) when it is
            not already present in the file.

    Returns:
        dict mapping ids to words (``reverse=True``) or words to ids.
    """
    words = read_lines(vocab_file, lambda x: x.strip().split()[0])
    if unk_token is not None and unk_token not in words:
        words.insert(0, unk_token)
    words = [w for w in words if w != '']
    if reverse:
        # id -> word
        return {idx: w.strip() for idx, w in enumerate(words)}
    # word -> id
    return {w.strip(): idx for idx, w in enumerate(words)}
def make_train_test_set(test_set_id=0, model_type=None, random_shuflle=True, random_seed=0):
    """Assemble train/test sets from the pre-split rating record files.

    Args:
        test_set_id: index of the split file held out as the test set.
            Falls back to the first available split when the requested
            file does not exist.
        model_type: for 'UserCF'/'ItemCF'/'LFM' the training data is
            returned as ``{user: {item: rate}}``; otherwise as a list of
            ``(user, item, rate)`` triples.
        random_shuflle: shuffle the training triples. (Misspelled name is
            kept for backward compatibility with existing callers.)
        random_seed: seed for the shuffle, for reproducibility.

    Returns:
        ``(traindata, test_input, test_labels)`` where ``test_input`` is a
        list of ``(user, item)`` pairs and ``test_labels`` the ratings.
    """
    # \. matches a literal dot and re.escape guards against regex
    # metacharacters in the configured prefix; the original pattern's
    # bare '.' matched any character.
    pattern = r'%s_\d+\.txt' % re.escape(CONFIG.rate_record_file_name)
    train_files = sorted(
        x for x in os.listdir(CONFIG.training_folder)
        if re.match(pattern, x) is not None
    )
    test_file = '%s_%d.txt' % (CONFIG.rate_record_file_name, test_set_id)
    if test_file not in train_files:
        print('error test_set_id! No such file %s' % test_file)
        test_file = train_files[0]
        print('use %s instead' % test_file)
    train_files.remove(test_file)
    train_files = [os.path.join(CONFIG.training_folder, x) for x in train_files]
    test_file = os.path.join(CONFIG.training_folder, test_file)

    # Each record line is "user item rate".
    testdata = read_lines(test_file, lambda x: x.split())
    test_input = [(int(x[0]), int(x[1])) for x in testdata]
    test_labels = [int(x[2]) for x in testdata]

    traindata = []
    for train_file in train_files:
        traindata.extend(read_lines(train_file, lambda x: x.split()))
    traindata = [(int(x[0]), int(x[1]), int(x[2])) for x in traindata]
    if random_shuflle:
        random.Random(random_seed).shuffle(traindata)
    if model_type in {'UserCF', 'ItemCF', 'LFM'}:
        # These models expect nested-dict access: rates[user][item].
        dict_train_data = defaultdict(dict)
        for user, item, rate in traindata:
            dict_train_data[user][item] = rate
        traindata = dict_train_data
    return traindata, test_input, test_labels
def split_dataset(k=5, overwrite=False):
    """Split the full rating record file into ``k`` round-robin subsets.

    Writes ``<rate_record_file_name>_0.txt`` .. ``_<k-1>.txt`` into the
    training folder.

    Args:
        k: number of output split files.
        overwrite: when False and exactly ``k`` split files already
            exist, leave them untouched.
    """
    if not overwrite:
        # \. matches a literal dot and re.escape guards against regex
        # metacharacters in the configured prefix; the original pattern's
        # bare '.' matched any character.
        pattern = r'%s_\d+\.txt' % re.escape(CONFIG.rate_record_file_name)
        files = [
            x for x in os.listdir(CONFIG.training_folder)
            if re.match(pattern, x) is not None
        ]
        if len(files) == k:
            return
    all_records = read_lines(CONFIG.rate_record_all)
    for i in range(k):
        out_file = os.path.join(
            CONFIG.training_folder,
            '%s_%d.txt' % (CONFIG.rate_record_file_name, i))
        # Round-robin assignment keeps the k splits evenly sized.
        subset = all_records[i::k]
        write_lines(out_file, subset, lambda x: x.strip())
def make_tags(overwrite=False):
    """Load or build the user tag lists, movie tag lists and tag word list.

    Results are cached as JSON / text files; each is rebuilt only when
    missing or when ``overwrite`` is True.

    Args:
        overwrite: rebuild the cached files even if they already exist.

    Returns:
        ``(user_tags, movie_tags, tag_words)`` where ``user_tags[i]`` and
        ``movie_tags[i]`` are tag lists aligned with the user/movie list
        files, and ``tag_words`` is the sorted, deduplicated list of all
        tags seen in either.
    """

    def parse_movie_tags(movie):
        # Genres from the movie's info.json; missing file means no tags.
        info_file = os.path.join(CONFIG.movie_path, movie, 'info.json')
        if not os.path.exists(info_file):
            return []
        info = load_json_file(info_file)
        return info.get("genres", [])

    def parse_user_tags(user):
        # Expand the user's type-frequency distribution so each tag is
        # repeated `freq` times (preserves tag weights for downstream use).
        collect_profile_file = os.path.join(CONFIG.user_path, user,
                                            'profile',
                                            'collect_distribution.json')
        if not os.path.exists(collect_profile_file):
            return []
        collect_profile = load_json_file(collect_profile_file)
        tag_distribution = collect_profile.get("type", {})
        return list(
            itertools.chain.from_iterable(
                [tag] * freq for tag, freq in tag_distribution.items()))

    if not overwrite and os.path.exists(CONFIG.user_tags_file):
        user_tags = load_json_file(CONFIG.user_tags_file)
    else:
        users = load_np_array(CONFIG.user_list_file)
        user_tags = list(map(parse_user_tags, users))
        save_json_file(CONFIG.user_tags_file, user_tags)
    if not overwrite and os.path.exists(CONFIG.movie_tags_file):
        movie_tags = load_json_file(CONFIG.movie_tags_file)
    else:
        movies = load_np_array(CONFIG.movie_list_file)
        movie_tags = list(map(parse_movie_tags, movies))
        save_json_file(CONFIG.movie_tags_file, movie_tags)
    if not overwrite and os.path.exists(CONFIG.tag_word_list):
        tag_words = read_lines(CONFIG.tag_word_list, lambda x: x.strip())
    else:
        # Sort for a deterministic file and a consistent (list) return
        # type: the original returned an unordered set on this branch but
        # a list when loading from the cached file.
        tag_words = sorted(
            set(itertools.chain.from_iterable(user_tags + movie_tags)))
        write_lines(CONFIG.tag_word_list, tag_words)
    return user_tags, movie_tags, tag_words