def vectorize_files(fileroot, savepath):
    """Encode every document under *fileroot* and persist the vectors.

    For each file read by ``ftools.read_dir_lines_dict`` the lines are
    joined into one text, preprocessed, and vectorized with an
    ``AutoCoder``.  The resulting ``[sentence_vectors, essay_vector]``
    pair is saved under the MD5 of the document's concatenated words.

    :param fileroot: directory whose files are read via
        ``ftools.read_dir_lines_dict`` (filename -> list of lines).
    :param savepath: output directory for the encoded objects.
        BUG FIX: this parameter was previously ignored in favor of the
        hard-coded ``Dir.res + "/encoder/cleandata_8700/"`` path.
    """
    import os  # local import: file's top-of-file imports are outside this view

    data = ftools.read_dir_lines_dict(fileroot)
    auto = AutoCoder()
    total = len(data)
    print(total)
    for count, key in enumerate(data, start=1):
        # Join the file's lines with the CJK full stop so the
        # preprocessor sees sentence boundaries.
        text = '。'.join(data[key])
        sens, sens_words, sens_tags = auto.preprocess(text)

        start = time.time()
        sens_vector, essay_vector = auto.vectorize(sens_words, sens_tags)
        end = time.time()

        # Key the saved object by the MD5 of the concatenated words, not
        # the filename, so identical texts map to the same cache entry.
        key_text = ''.join(''.join(var) for var in sens_words)
        save_key = tools.md5(key_text)

        save_object = [[list(var) for var in sens_vector], list(essay_vector)]
        tools.save_object(save_object, os.path.join(savepath, save_key))

        print(count, total, end - start)
def vectorize(self, sens_words, sens_tags):
    """Return ``(sentence_vectors, essay_vector)`` for a document, memoized.

    The document is identified by the MD5 of its concatenated words.  On
    a cache hit the stored pair from ``self.data`` is returned; on a miss
    the underlying encoder (``self.auto``) is run, the result is persisted
    to disk for future runs, and — FIX — also stored in ``self.data`` so
    repeated calls within the same session do not re-encode.

    :param sens_words: per-sentence word lists for the document.
    :param sens_tags: per-sentence tag lists, parallel to *sens_words*.
    :return: ``(sentence_vectors, essay_vector)`` as produced by the encoder.
    """
    key_text = ''.join(''.join(var) for var in sens_words)
    key = tools.md5(key_text)
    try:
        # EAFP: single dict lookup instead of `key in self.data.keys()`.
        tmp = self.data[key]
    except KeyError:
        print("training")  # fixed typo: was "trainning"
        tmp0, tmp1 = self.auto.vectorize(sens_words, sens_tags)
        tmp = [tmp0, tmp1]
        # Persist under the same MD5 key (previously recomputed via a
        # redundant second tools.md5 call) so later runs can skip encoding.
        save_object = [[list(var) for var in tmp0], list(tmp1)]
        tools.save_object(save_object, Dir.res + "/encoder/cleandata_8700/" + key)
        # FIX: memoize in memory as well; the original only wrote to disk,
        # so every in-session call for the same text re-ran the encoder.
        self.data[key] = tmp
    return tmp[0], tmp[1]