Exemplo n.º 1
0
    def test_get_data_directory(self):
        """Verify get_data_directory for the "answer" and "question" categories.

        For each category the returned path must exist on disk and must
        contain the ``sda/data/zhihu/<category>`` fragment.
        """
        answer_data_dir = zhihu_util.get_data_directory("answer")
        self.assertTrue(os.path.exists(answer_data_dir),
                        "data directory does not exist: %s" % answer_data_dir)
        # assertIn reports both operands on failure, whereas
        # assertTrue(x in y) only says "False is not true".
        self.assertIn("sda/data/zhihu/answer", answer_data_dir)

        question_data_dir = zhihu_util.get_data_directory("question")
        self.assertTrue(os.path.exists(question_data_dir),
                        "data directory does not exist: %s" % question_data_dir)
        self.assertIn("sda/data/zhihu/question", question_data_dir)
Exemplo n.º 2
0
def flush_buffer(write_buffer, suffix, ts, thread_index, mode="finish"):
    """Write ``write_buffer`` to a file inside the "user" data directory.

    The target file name is
    ``<data_dir>/<suffix><USER_FILE_DELIMITER><int(ts)>-<thread_index>``,
    with ``.tmp`` appended when ``mode`` is ``"doing"``.

    Args:
        write_buffer: Buffered content, handed unchanged to
            ``zhihu_util.write_buffer_file``.
        suffix: Leading component of the file name.
        ts: Timestamp-like value; converted with ``int()`` before it is
            embedded in the file name.
        thread_index: Value appended as the last component of the file name.
        mode: ``"doing"`` marks the file with a ``.tmp`` extension; any other
            value (default ``"finish"``) leaves the name untouched.
    """
    # A parenthesised single argument prints the same text under both the
    # Python 2 print statement and the Python 3 print function.
    print("...write buffer into disk...")
    data_dir = zhihu_util.get_data_directory("user")
    buffer_filename = "%s/%s%s-%s-%s" % (data_dir, suffix, USER_FILE_DELIMITER, int(ts),
                                         thread_index)
    if mode == "doing":
        buffer_filename += ".tmp"
    zhihu_util.write_buffer_file(write_buffer, buffer_filename, USER_FIELD_DELIMITER)
Exemplo n.º 3
0
def generate_write_bloomfilter(dir_name, capacity=1000000, error_rate=0.01):
    """Build a bloom filter from the first field of every line in a data directory.

    Each file returned by ``zhihu_util.get_file_list`` for the directory
    resolved from ``dir_name`` is read line by line. The text before the first
    ``USER_FIELD_DELIMITER`` is added to the filter unless it is empty or
    whitespace-only.

    Args:
        dir_name: Name passed to ``zhihu_util.get_data_directory``.
        capacity: First argument given to ``BloomFilter``.
        error_rate: Second argument given to ``BloomFilter``.

    Returns:
        The populated ``BloomFilter`` instance.
    """
    bloom = BloomFilter(capacity, error_rate)
    source_dir = zhihu_util.get_data_directory(dir_name)
    for path in zhihu_util.get_file_list(source_dir):
        with open(path, "r") as handle:
            for record in handle:
                # Only the leading field of each record is of interest.
                key = record.split(USER_FIELD_DELIMITER, 1)[0]
                if key.strip() == '':
                    continue
                # The key is stored as read (not stripped).
                bloom.add(str(key))
    return bloom
Exemplo n.º 4
0
def init_user_access():
    """Return the distinct leading name components of the "user" data files.

    Every name reported by ``zhihu_util.get_file_names`` for the "user" data
    directory is cut at the first ``USER_FILE_DELIMITER``; the parts before
    the delimiter are collected into a set.
    """
    user_dir = zhihu_util.get_data_directory("user")
    return {
        name.split(USER_FILE_DELIMITER, 1)[0]
        for name in zhihu_util.get_file_names(user_dir)
    }