def wordvector(self):
    np.set_printoptions(threshold=np.inf)  # print full arrays instead of a summary
    # 1. Read the stop-word list
    stpwrdlst = utils.readfile(self.stopword_path, encoding='UTF-8').splitlines()
    # 2. Load the Bunch object holding the segmented documents
    bunch = utils.readbunchobj(self.bunch_path)
    # 3. Build the TF-IDF vector-space object
    #    vocabulary: the term dictionary
    tfidfspace = Bunch(target_name=bunch.target_name,
                       label=bunch.label,
                       filenames=bunch.filenames,
                       tdm=[],
                       vocabulary={})
    # 4. Initialise the vector-space model with TfidfVectorizer/CountVectorizer
    # max_df: upper bound on a term's document frequency in the tf-idf matrix.
    #   If a term appears in, say, 80% of the documents, it probably carries
    #   very little information for this corpus.
    # min_df: may be an integer (e.g. 5), meaning a term must appear in at
    #   least 5 documents to be kept; a float such as 0.2 means it must appear
    #   in at least 20% of the documents.
    # vectorizer1 = TfidfVectorizer(stop_words=stpwrdlst,
    #                               sublinear_tf=True)
    vectorizer = CountVectorizer(stop_words=stpwrdlst)  # text -> term-frequency matrix; the vocabulary is kept separately
    transformer = TfidfTransformer()  # tfidfspace.tdm will hold the tf-idf weights
    tfidfspace.tdm = transformer.fit_transform(
        vectorizer.fit_transform(bunch.contents))
    # print(vectorizer.fit_transform(bunch.contents))
    # print(np.array(tfidfspace.tdm.toarray()))
    tfidfspace.vocabulary = vectorizer.vocabulary_
    # 5. Persist the bag-of-words space
    utils.writebunchobj(self.space_path, tfidfspace)
    print("TF-IDF bag of words created successfully")
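
# A minimal, self-contained sketch (not part of the pipeline) of the
# CountVectorizer + TfidfTransformer pairing used above: the vectorizer keeps
# the vocabulary separately while the transformer re-weights the raw counts
# into tf-idf values. The toy documents are illustrative only.
def _tfidf_sketch():
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    docs = ["apple banana apple", "banana cherry", "apple cherry cherry"]
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(docs)           # raw term-frequency matrix
    tfidf = TfidfTransformer().fit_transform(counts)  # tf-idf weighted matrix
    print(vectorizer.vocabulary_)                     # e.g. {'apple': 0, 'banana': 1, 'cherry': 2}
    print(tfidf.toarray())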
def wordcut(self):
    catelist = os.listdir(self.corpus_path)
    for mydir in catelist:
        class_path = self.corpus_path + mydir + "/"
        seg_dir = self.seg_path + mydir + "/"
        if not os.path.exists(seg_dir):
            os.makedirs(seg_dir)
        file_list = os.listdir(class_path)
        for file_path in file_list:
            fullname = class_path + file_path
            content = utils.readfile(fullname).strip()
            content = removeHTML(content)
            content = content.replace("\r\n", "").strip()
            content_seg = jieba.cut(content)
            utils.savefile(seg_dir + file_path, " ".join(content_seg))
    print('Word segmentation of the Chinese corpus finished:', self.corpus_path)
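
# A minimal sketch of the segmentation step above: jieba.cut returns a
# generator of tokens, which is space-joined before being saved so that the
# vectorizers can later split on whitespace. The sample sentence is
# illustrative only.
def _wordcut_sketch():
    import jieba
    content = "自然语言处理是人工智能的一个分支"
    content_seg = jieba.cut(content)
    print(" ".join(content_seg))  # tokens separated by single spaces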
def fastpredict(self, content, stopword_path, train_space_path):
    # 1. Read the stop-word list
    stpwrdlst = readfile(stopword_path).splitlines()
    # 2. Convert content into a term vector
    content = removeHTML(content)
    content = content.replace("\r\n", "").strip()
    content_seg = jieba.cut(content)
    # 3. Load the training-set bag of words
    trainbunch = readbunchobj(train_space_path)
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst,
                                 sublinear_tf=True,
                                 max_df=1, min_df=1,
                                 vocabulary=trainbunch.vocabulary)
    tdm = vectorizer.fit_transform([" ".join(content_seg)])
    # 4. Predict. alpha is the additive (Laplace/Lidstone) smoothing parameter;
    #    smaller values mean weaker smoothing.
    clf = MultinomialNB(alpha=0.001).fit(trainbunch.tdm, trainbunch.label)
    predicted1 = clf.predict_proba(tdm)
    predicted2 = clf.predict(tdm)
    print(predicted1, predicted2)
    return predicted1, predicted2
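
# A minimal sketch (toy counts, hypothetical labels) of the classification
# step above: MultinomialNB's alpha is its additive-smoothing parameter, and
# predict_proba/predict return the per-class probabilities and the argmax class.
def _nb_sketch():
    import numpy as np
    from sklearn.naive_bayes import MultinomialNB
    X = np.array([[2, 0, 1], [0, 3, 1], [1, 0, 2]])  # toy term-frequency rows
    y = ["sports", "finance", "sports"]               # hypothetical labels
    clf = MultinomialNB(alpha=0.001).fit(X, y)
    print(clf.predict_proba(X[:1]), clf.predict(X[:1]))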
def wordbunch(self):
    # Bunch provides a key/value object; the keys mean:
    #   target_name: list of all category names
    #   label:       the category label of each file
    #   filenames:   the path of each file
    #   contents:    the segmented text of each file
    bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
    catelist = os.listdir(self.seg_path)
    bunch.target_name.extend(catelist)
    for mydir in catelist:
        class_path = self.seg_path + mydir + '/'
        file_list = os.listdir(class_path)
        for file_path in file_list:
            fullname = class_path + file_path
            bunch.label.append(mydir)          # label of the current file
            bunch.filenames.append(fullname)   # path of the current file
            print(fullname)
            bunch.contents.append(utils.readfile(fullname).strip())
    if not os.path.exists(self.wordbag_path):
        os.makedirs(self.wordbag_path)
    with open(self.wordbag_file, 'wb') as file_obj:
        pickle.dump(bunch, file_obj)
    print("Finished building the text Bunch object:", self.seg_path)
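
# A minimal sketch of the persistence used above: the Bunch is serialised with
# pickle, and readbunchobj/writebunchobj are presumably the matching
# pickle.load/pickle.dump wrappers -- that pairing, and importing Bunch from
# sklearn.utils, are assumptions for this sketch, not taken from utils itself.
def _bunch_roundtrip_sketch(path="tmp_bunch.dat"):
    import pickle
    from sklearn.utils import Bunch
    bunch = Bunch(target_name=["sports"], label=["sports"],
                  filenames=["sports/0001.txt"], contents=["toy segmented text"])
    with open(path, 'wb') as f:
        pickle.dump(bunch, f)
    with open(path, 'rb') as f:
        restored = pickle.load(f)
    print(restored.target_name, restored.label)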
def wordvector(self):
    # 1. Read the stop-word list
    stpwrdlst = readfile(self.stopword_path).splitlines()
    # 2. Load the Bunch object holding the segmented test documents
    bunch = readbunchobj(self.bunch_path)
    # 3. Build the TF-IDF vector-space object
    #    vocabulary: the term dictionary
    testspace = Bunch(target_name=bunch.target_name,
                      label=bunch.label,
                      filenames=bunch.filenames,
                      tdm=[],
                      vocabulary={})
    # 4. Load the training-set bag of words
    trainbunch = readbunchobj(self.train_space_path)
    # 5. Initialise the vector-space model, reusing the training vocabulary
    #    (max_df / min_df have the same meaning as in the training wordvector() above)
    # vectorizer = TfidfVectorizer(stop_words=stpwrdlst,
    #                              sublinear_tf=True, max_df=1, min_df=1,
    #                              vocabulary=trainbunch.vocabulary)
    vectorizer = CountVectorizer(stop_words=stpwrdlst,
                                 vocabulary=trainbunch.vocabulary)  # term-frequency matrix over the training vocabulary
    transformer = TfidfTransformer()
    testspace.tdm = transformer.fit_transform(
        vectorizer.fit_transform(bunch.contents))
    testspace.vocabulary = vectorizer.vocabulary_
    # 6. Persist the test bag of words
    space_path = self.test_space_path
    writebunchobj(space_path, testspace)
    print("Test bag of words created successfully")
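
# A minimal sketch of the vocabulary reuse above: giving CountVectorizer a
# fixed vocabulary maps test documents onto exactly the same columns as the
# training matrix, and words unseen at training time are simply dropped.
# Toy data only.
def _shared_vocab_sketch():
    from sklearn.feature_extraction.text import CountVectorizer
    train_vocab = {"apple": 0, "banana": 1, "cherry": 2}
    vectorizer = CountVectorizer(vocabulary=train_vocab)
    tdm = vectorizer.fit_transform(["banana durian apple"])  # 'durian' is ignored
    print(tdm.toarray())  # [[1 1 0]] -- same column order as the training space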
def fastpredict():
    p = Predict()
    content = utils.readfile('fastpredict/content.txt')
    stopword_path = "hlt_stop_words.txt"
    train_space_path = 'train_word_bag/tfidfspace.dat'
    return p.fastpredict(content, stopword_path, train_space_path)