def get_word_term_dic(self):
    """Load term records from ./data/word_term_dic.txt into self.word_term_dic.

    Each line holds four "@@@@"-separated fields: three integers that are
    passed positionally to ``Term``, followed by a "##"-separated list of
    integers that is handed to ``Term.append_location``.
    (NOTE(review): the meaning of the three Term arguments is not visible
    here -- confirm against the Term class.)

    Terms are grouped in ``self.word_term_dic`` under the integer value of
    the first field; each key maps to a list of ``Term`` objects in file
    order.
    """
    lines = tool.get_file_lines("./data/word_term_dic.txt")
    for line in lines:
        fields = line.split("@@@@")
        # Parse the grouping key once; it is both the first Term argument
        # and the dictionary key.
        word_id = int(fields[0])
        term = Term(word_id, int(fields[1]), int(fields[2]))
        term.append_location([int(x) for x in fields[3].split("##")])
        # Append in place rather than rebuilding the list with
        # `get(...) + [term]`, which copied the whole list for every line.
        self.word_term_dic.setdefault(word_id, []).append(term)
def get_word_index_dic(self):
    """Load the word -> index mapping from ./data/word_index_dic.txt.

    Each line is expected to hold a word and an integer index separated by
    a tab. The word (decoded from UTF-8 when it is a byte string) becomes
    the key in ``self.word_index_dic`` and the index becomes the value.

    Malformed lines -- a missing index column, a non-integer index, or a
    word that is not valid UTF-8 -- are skipped.
    """
    lines = tool.get_file_lines("./data/word_index_dic.txt")
    for line in lines:
        fields = line.split("\t")
        try:
            index = int(fields[1])
            word = fields[0]
            # Only byte strings need decoding. Text strings have no
            # usable .decode (it raises on Python 3), which previously
            # caused every line to be skipped silently.
            if isinstance(word, bytes):
                word = word.decode("utf-8")
        except (IndexError, ValueError):
            # IndexError: no tab-separated index column.
            # ValueError: non-integer index, or UnicodeDecodeError
            # (a ValueError subclass) from the decode.
            continue
        self.word_index_dic[word] = index
def get_lasted_doc_id():
    """Return the id of the latest document recorded in ./data/doc.txt.

    The id is read from the first "@@@@"-separated field of the last line
    returned by ``tool.get_file_lines``.

    :return: the last document id as an int, or 0 when no last line can be
        read (IndexError or TypeError while accessing it).
    """
    lines = tool.get_file_lines("./data/doc.txt")
    try:
        lasted_id = int(lines[-1].split("@@@@")[0])
    except (IndexError, TypeError):
        # The handler must name a tuple: `except IndexError or TypeError`
        # evaluates the `or` first and only ever catches IndexError.
        lasted_id = 0
    return lasted_id
def get_word_freq_dic(self):
    """Populate self.word_freq_dic from ./data/word_freq_dic.txt.

    Every line is split on tabs; the second field (as an int) is stored
    under the first field (as an int).
    """
    for record in tool.get_file_lines("./data/word_freq_dic.txt"):
        columns = record.split("\t")
        # Value is parsed before the key, matching assignment evaluation
        # order, so a malformed line fails the same way it always did.
        freq = int(columns[1])
        word_id = int(columns[0])
        self.word_freq_dic[word_id] = freq
def get_doc_dic(self):
    """Populate self.doc_dic from ./data/doc.txt.

    Every line is split on "@@@@"; the second field is stored unchanged
    under the first field converted to an int.
    """
    for record in tool.get_file_lines("./data/doc.txt"):
        parts = record.split("@@@@")
        # Value is read before the key is parsed, matching assignment
        # evaluation order, so a malformed line fails the same way.
        payload = parts[1]
        doc_id = int(parts[0])
        self.doc_dic[doc_id] = payload