# Shared imports used by the snippets in this section
import os
import pickle

import jieba
from sklearn.utils import Bunch  # older scikit-learn tutorials import Bunch from sklearn.datasets.base
from sklearn.feature_extraction.text import TfidfVectorizer


def corpus_segment(corpus_path, seg_path):
    catelist = os.listdir(corpus_path)  # Get all subdirectories under corpus_path
    # The name of each subdirectory is the category name
    # print("Segmenting... Please wait.")
    # Process all the files under each directory (category)
    for mydir in catelist:
        class_path = corpus_path + mydir + "/"
        seg_dir = seg_path + mydir + "/"
        if not os.path.exists(seg_dir):  # Create the segmentation directory if it does not exist
            os.makedirs(seg_dir)
        file_list = os.listdir(class_path)  # Get all texts of this category in the unsegmented corpus
        # Traverse and process all files in the category directory
        for file_path in file_list:
            fullname = class_path + file_path
            content = readfile(fullname)
            content = content.replace('\r\n'.encode('utf-8'), ''.encode('utf-8')).strip()  # Delete line breaks
            content = content.replace(' '.encode('utf-8'), ''.encode('utf-8')).strip()     # Delete empty lines and extra spaces
            content_seg = jieba.cut(content)  # Segment the file content
            savefile(seg_dir + file_path, ' '.join(content_seg).encode('utf-8'))  # Save the segmented file
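# corpus_segment() relies on readfile() and savefile() helpers that are not shown in this
# section. A minimal sketch of what they most likely do (raw byte I/O), written as an
# assumption rather than the project's actual implementation:
def readfile(path):
    """Read a file and return its raw bytes."""
    with open(path, "rb") as fp:
        return fp.read()


def savefile(savepath, content):
    """Write raw bytes to a file, creating or overwriting it."""
    with open(savepath, "wb") as fp:
        fp.write(content)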
def vector_space(stopword_path, bunch_path, space_path, train_tfidf_path=None):
    stpwrdlst = readfile(stopword_path).splitlines()  # Load the stop-word list
    bunch = readbunchobj(bunch_path)                  # Load the segmented-text Bunch object
    tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label,
                       filenames=bunch.filenames, tdm=[], vocabulary={})

    if train_tfidf_path is not None:
        # Reuse the training-set vocabulary so test vectors live in the same feature space
        trainbunch = readbunchobj(train_tfidf_path)
        tfidfspace.vocabulary = trainbunch.vocabulary
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True,
                                     max_df=0.5, vocabulary=trainbunch.vocabulary)
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    else:
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5)
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
        tfidfspace.vocabulary = vectorizer.vocabulary_

    writebunchobj(space_path, tfidfspace)
    print("TF-IDF vector space created successfully!")
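# vector_space() also calls readbunchobj() and writebunchobj(), which are not defined in this
# section. A minimal sketch, assuming they simply pickle and unpickle a Bunch object:
def readbunchobj(path):
    """Load a pickled Bunch object from disk."""
    with open(path, "rb") as fp:
        return pickle.load(fp)


def writebunchobj(path, bunchobj):
    """Pickle a Bunch object to disk."""
    with open(path, "wb") as fp:
        pickle.dump(bunchobj, fp)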
def corpus_segment(corpus_path, seg_path):
    catelist = os.listdir(corpus_path)  # Get all subdirectories under corpus_path
    print("Segmenting like crazy...")
    # Process all files under each directory (category)
    for mydir in catelist:
        class_path = corpus_path + mydir + "/"  # Build the path of the category subdirectory
        seg_dir = seg_path + mydir + "/"        # Build the corresponding output directory for segmented files
        if not os.path.exists(seg_dir):         # Create the segmentation directory if it does not exist
            os.makedirs(seg_dir)
        file_list = os.listdir(class_path)      # Get all texts of this category in the unsegmented corpus
        for file_path in file_list:             # Traverse all files in the category directory
            fullname = class_path + file_path   # Full file path, e.g. train_corpus/art/21.txt
            content = readfile(fullname)        # Read the file content
            content = content.replace('\r\n'.encode('utf-8'), ''.encode('utf-8')).strip()  # Delete line breaks
            content = content.replace(' '.encode('utf-8'), ''.encode('utf-8')).strip()     # Delete empty lines and extra spaces
            content_seg = jieba.cut(content)    # Segment the file content
            savefile(seg_dir + file_path, ' '.join(content_seg).encode('utf-8'))  # Save the segmented file to the segmented-corpus directory
    print("Chinese corpus segmentation finished!")
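# Typical usage of corpus_segment(), assuming the directory layout referred to in the comments
# above (train_corpus/<category>/<doc>.txt). The "train_corpus_seg/" output path appears in a
# later snippet's comments; the test-corpus paths are an assumption for illustration only:
if __name__ == "__main__":
    corpus_segment("train_corpus/", "train_corpus_seg/")  # segment the training corpus
    corpus_segment("test_corpus/", "test_corpus_seg/")    # segment the test corpus (hypothetical paths)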
def tf_idf(bunch):
    stpwrdlst = readfile("stopwords.txt").splitlines()  # Load the stop-word list
    tfidfspace = Bunch(target_name=bunch.target_name, tdm=[], vocabulary={})
    # Reuse the vocabulary of the training-set TF-IDF space so the new documents are
    # vectorized in the same feature space as the training data
    trainbunch = readbunchobj("train_word_bag/tfdifspace.dat")
    tfidfspace.vocabulary = trainbunch.vocabulary
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True,
                                 max_df=0.5, vocabulary=trainbunch.vocabulary)
    print(bunch.contents)  # Debug output: the segmented contents about to be vectorized
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    return tfidfspace.tdm
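# The TF-IDF matrix returned by tf_idf() is what a classifier consumes. No classifier appears in
# this section, so the following is only a hypothetical sketch: it assumes the training space in
# train_word_bag/tfdifspace.dat carries a `label` field (as built by vector_space above) and uses
# scikit-learn's MultinomialNB; the model choice and the test-bunch path are assumptions.
from sklearn.naive_bayes import MultinomialNB

train_space = readbunchobj("train_word_bag/tfdifspace.dat")
clf = MultinomialNB(alpha=0.001).fit(train_space.tdm, train_space.label)

# tf_idf() expects a Bunch with target_name and contents fields (e.g. one produced by
# corpus2Bunchtest()); the path below is hypothetical.
test_bunch = readbunchobj("test_word_bag/test_set.dat")
print(clf.predict(tf_idf(test_bunch)))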
def bunch2Space(bunch):
    stopword_path = "train_word_bag/hlt_stop_words.txt"
    stpwrdlst = readfile(stopword_path).splitlines()  # Load the stop-word list
    tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label,
                       filenames=bunch.filenames, tdm=[], vocabulary={})
    # Build the TF-IDF weight matrix and keep the fitted vocabulary
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True)
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    tfidfspace.vocabulary = vectorizer.vocabulary_
    return tfidfspace
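# bunch2Space() fits a fresh vocabulary on whatever bunch it is given, unlike vector_space(),
# which can reuse the training vocabulary. A usage sketch; the pickled-bunch path is hypothetical:
train_bunch = readbunchobj("train_word_bag/train_set.dat")
train_space = bunch2Space(train_bunch)
print(train_space.tdm.shape, len(train_space.vocabulary))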
def corpus2Bunch(filename):
    wordbag_path = "upload_word_bag/" + filename + "_set.dat"  # Path where the Bunch is stored
    seg_path = "upload_corpus_seg/" + filename + ".txt"        # Path of the segmented uploaded file
    # Create a Bunch instance holding the single uploaded document
    bunch = Bunch(filenames=[seg_path], contents=[readfile(seg_path)])
    # Store the bunch at wordbag_path
    with open(wordbag_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print("Text bunch object constructed!")
def corpus_segment(corpus_path, seg_path):
    '''
    corpus_path is the path of the unsegmented corpus
    seg_path is the path where the segmented corpus is stored
    '''
    catelist = os.listdir(corpus_path)  # Get all subdirectories under corpus_path
    '''
    The names of the subdirectories are the category names.
    In train_corpus/it/21.txt, 'train_corpus/' is corpus_path and 'it' is one member of catelist.
    '''
    print("Segmenting like crazy...")
    # Process all files under each directory (category)
    for mydir in catelist:
        '''
        Here mydir is the 'it' part of train_corpus/it/21.txt (i.e. one category in catelist)
        '''
        class_path = corpus_path + mydir + "/"  # Path of the category subdirectory, e.g. train_corpus/it/
        seg_dir = seg_path + mydir + "/"        # Corresponding output directory for segmented files, e.g. train_corpus_seg/it/
        if not os.path.exists(seg_dir):         # Create the segmentation directory if it does not exist
            os.makedirs(seg_dir)
        file_list = os.listdir(class_path)      # Get all texts of this category in the unsegmented corpus
        '''
        For train_corpus/it/ this is 21.txt, 22.txt, 23.txt ...
        file_list = ['21.txt', '22.txt', ...]
        '''
        for file_path in file_list:             # Traverse all files in the category directory
            fullname = class_path + file_path   # Full file path, e.g. train_corpus/it/21.txt
            content = readfile(fullname)        # Read the file content
            '''
            At this point content holds every character of the original text, including extra
            spaces, empty lines, carriage returns and so on. Next we strip these irrelevant
            characters, leaving compact text separated only by punctuation.
            '''
            content = content.replace('\r\n'.encode('utf-8'), ''.encode('utf-8')).strip()  # Delete line breaks
            content = content.replace(' '.encode('utf-8'), ''.encode('utf-8')).strip()     # Delete empty lines and extra spaces
            content_seg = jieba.cut(content)    # Segment the file content
            savefile(seg_dir + file_path, ' '.join(content_seg).encode('utf-8'))  # Save the processed file to the segmented-corpus directory
    print("Chinese corpus segmentation finished!")
def corpus2Bunchtest(wordbag_path, seg_path):
    catelist = os.listdir(seg_path)  # List all segmented test files under seg_path
    # Create a Bunch instance
    bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
    bunch.target_name.extend(catelist)
    # Read every file under seg_path
    for file_path in catelist:                     # Traverse the files
        fullname = seg_path + file_path            # Full file path
        doc_id = file_path[:-4]                    # Strip the ".txt" extension to get the document id
        bunch.filenames.append(doc_id)
        bunch.contents.append(readfile(fullname))  # Read the file content
    # Store the bunch at wordbag_path
    with open(wordbag_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print("Text bunch object constructed!")
def vector_space(filename):
    stopword_path = "upload_word_bag/hlt_stop_words.txt"
    bunch_path = "upload_word_bag/" + filename + "_set.dat"
    space_path = "upload_word_bag/" + filename + "space.dat"
    train_tfidf_path = "train_word_bag/tfdifspace.dat"

    stpwrdlst = readfile(stopword_path).splitlines()  # Load the stop-word list
    bunch = readbunchobj(bunch_path)                  # Load the uploaded document's Bunch object
    tfidfspace = Bunch(filenames=bunch.filenames, tdm=[], vocabulary={})
    # Reuse the training-set vocabulary so the uploaded document shares the training feature space
    trainbunch = readbunchobj(train_tfidf_path)
    tfidfspace.vocabulary = trainbunch.vocabulary
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True,
                                 max_df=0.5, vocabulary=trainbunch.vocabulary)
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    writebunchobj(space_path, tfidfspace)
    print("TF-IDF vector space created successfully!")
def corpus_segment(filename):
    # Segment an uploaded file
    corpus_path = "./upload_corpus/" + filename + ".txt"   # Path of the unsegmented uploaded file
    seg_path = "./upload_corpus_seg/" + filename + ".txt"  # Path of the segmented output file
    '''
    corpus_path is the path of the unsegmented text
    seg_path is the path where the segmented text is stored
    '''
    content = readfile(corpus_path)  # Read the file content
    '''
    At this point content holds every character of the original text, including extra spaces,
    empty lines, carriage returns and so on. Next we strip these irrelevant characters,
    leaving compact text separated only by punctuation.
    '''
    content = content.replace('\r\n'.encode('utf-8'), ''.encode('utf-8')).strip()  # Delete line breaks
    content = content.replace(' '.encode('utf-8'), ''.encode('utf-8')).strip()     # Delete empty lines and extra spaces
    content_seg = jieba.cut(content)  # Segment the file content
    savefile(seg_path, ' '.join(content_seg).encode('utf-8'))  # Save the processed file to the segmented-corpus directory
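# The upload-handling snippets corpus_segment(filename), corpus2Bunch(filename) and
# vector_space(filename) chain together as a small pipeline: segment the uploaded file, wrap it
# in a Bunch, then project it into the training TF-IDF space. A usage sketch, assuming a file
# ./upload_corpus/example.txt exists; the name "example" is hypothetical:
filename = "example"
corpus_segment(filename)  # writes ./upload_corpus_seg/example.txt
corpus2Bunch(filename)    # writes upload_word_bag/example_set.dat
vector_space(filename)    # writes upload_word_bag/examplespace.dat in the training vocabulary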
def corpus2Bunch(wordbag_path, seg_path):
    catelist = os.listdir(seg_path)  # The subdirectory names are the category labels
    bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
    bunch.target_name.extend(catelist)  # extend() expands the original list with another list
    # Collect every file under each category directory
    for mydir in catelist:
        class_path = seg_path + mydir + "/"
        file_list = os.listdir(class_path)
        for file_path in file_list:
            fullname = class_path + file_path
            bunch.label.append(mydir)  # append() adds a single element to the original list
            bunch.filenames.append(fullname)
            bunch.contents.append(readfile(fullname))
    # Store the bunch at wordbag_path
    with open(wordbag_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
def corpus2Bunch(wordbag_path, seg_path):
    catelist = os.listdir(seg_path)  # get the categories
    # create a bunch
    bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
    bunch.target_name.extend(catelist)
    # obtain the files under each category path
    for mydir in catelist:
        class_path = seg_path + mydir + "/"  # give the full path
        file_list = os.listdir(class_path)   # get all files under class_path
        for file_path in file_list:          # visit all the files under the path
            fullname = class_path + file_path
            bunch.label.append(mydir)
            bunch.filenames.append(fullname)
            bunch.contents.append(readfile(fullname))  # read the txt
    # store bunch into the wordbag_path
    with open(wordbag_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print("the construction of the text object is finished!!!")
def corpus_segment(corpus_path, seg_path):
    '''
    corpus_path is the path of the corpus before segmentation
    seg_path is the path of the corpus after segmentation
    '''
    catelist = os.listdir(corpus_path)
    '''
    catelist records all the folder names in corpus_path,
    such as 'art', 'literature', 'education' ...
    '''
    print("jieba is working...")
    # process the files under each folder
    for mydir in catelist:
        class_path = corpus_path + mydir + "/"  # e.g. train_corpus/art/
        seg_dir = seg_path + mydir + "/"        # e.g. train_corpus_seg/art/
        if not os.path.exists(seg_dir):         # create the train_corpus_seg directory if missing
            os.makedirs(seg_dir)
        file_list = os.listdir(class_path)
        for file_path in file_list:             # visit every file in file_list
            fullname = class_path + file_path   # full path, e.g. train_corpus/art/21.txt
            content = readfile(fullname)        # read the .txt file
            '''delete the white space, empty lines and carriage returns'''
            content = content.replace('\r\n'.encode('utf-8'), ''.encode('utf-8')).strip()  # delete carriage returns
            content = content.replace(' '.encode('utf-8'), ''.encode('utf-8')).strip()     # delete white space
            content_seg = jieba.cut(content)    # segment the file content
            savefile(seg_dir + file_path, ' '.join(content_seg).encode('utf-8'))  # put the segmented file into seg_path
    print("Segmentation of the corpus is finished!")
def corpus2Bunch(wordbag_path, seg_path):
    catelist = os.listdir(seg_path)  # Get all subdirectories under seg_path, i.e. the category names
    # Create a Bunch instance
    bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
    bunch.target_name.extend(catelist)
    '''
    extend(addlist) is a Python list method that extends the original list
    with the new list addlist
    '''
    # Collect all files under each category directory
    for mydir in catelist:
        class_path = seg_path + mydir + "/"    # Build the path of the category subdirectory
        file_list = os.listdir(class_path)     # Get all files under class_path
        for file_path in file_list:            # Traverse the files in the category directory
            fullname = class_path + file_path  # Full file path
            bunch.label.append(mydir)
            bunch.filenames.append(fullname)
            bunch.contents.append(readfile(fullname))  # Read the file content
            '''append(element) is a Python list method that adds a single element to the
            original list; note the difference from extend()'''
    # Store the bunch at wordbag_path
    with open(wordbag_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print("Text bunch object constructed!")
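# Typical usage of corpus2Bunch(wordbag_path, seg_path), assuming the segmented-corpus layout
# used in the comments above; the exact .dat file names and the test-set paths are assumptions:
if __name__ == "__main__":
    corpus2Bunch("train_word_bag/train_set.dat", "train_corpus_seg/")
    corpus2Bunch("test_word_bag/test_set.dat", "test_corpus_seg/")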
def vector_space(stopword_path, bunch_path, space_path, train_tfidf_path=None):
    # Read the stop-word list
    stpwrdlst = readfile(stopword_path).splitlines()
    bunch = readbunchobj(bunch_path)  # Load the segmented-text Bunch object
    # Build the TF-IDF vector space object
    tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label,
                       filenames=bunch.filenames, tdm=[], vocabulary={})
    '''
    Bunch was introduced in the earlier sections. target_name, label and filenames are members
    we defined ourselves and were explained before, so they are not repeated here.
    The two remaining members, tdm and vocabulary (also our own), are:
    tdm:        the TF-IDF weight matrix computed below. Remember that what the classifier
                needs later is essentially the training set's tdm plus the labels, so this
                member matters.
    vocabulary: the term index, e.g. vocabulary={"我": 0, "喜欢": 1, "相国大人": 2}; the numbers
                are the column indices of the tdm matrix.
    We are about to build the vector space, so at this point tdm and vocabulary are naturally
    both empty; assigning vocabulary some hand-made content here would make no sense.
    '''
    '''
    The code below is equivalent to:
        vectorizer = CountVectorizer()    # computes term frequencies (TF), among other things
        transformer = TfidfTransformer()  # computes TF-IDF from a term-frequency matrix
        tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
        # vectorizer.fit_transform(corpus) turns the text corpus into a term-frequency matrix;
        # transformer.fit_transform(term-frequency matrix) turns that into the TF-IDF weight matrix.
    As the names suggest: TfidfTransformer + CountVectorizer = TfidfVectorizer,
    and the code below does both steps in one go.
    Note that CountVectorizer() and TfidfVectorizer() both expose a member called vocabulary_
    (with a trailing underscore). Its meaning is the same as the vocabulary we defined on our
    Bunch above; one is fitted internally, the other is supplied from outside, and in principle
    they should agree. So the vocabulary defined when creating tfidfspace above should
    eventually be assigned this vocabulary_.
    '''
    # Initialise the vector space model with TfidfVectorizer in one step: it gives us both the
    # TF-IDF weight matrix and the vocabulary_ that defines the axes of the vector space.
    if train_tfidf_path is not None:
        # Load the TF-IDF vector space of the training set
        trainbunch = readbunchobj(train_tfidf_path)
        tfidfspace.vocabulary = trainbunch.vocabulary
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True,
                                     max_df=0.5, vocabulary=trainbunch.vocabulary)
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    else:
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5)
        # tdm now holds the TF-IDF weight matrix
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
        tfidfspace.vocabulary = vectorizer.vocabulary_
    '''
    The only parameters you need to know about:
    stop_words:   the stop-word list; its words are removed from the text before vocabulary_ is built.
    vocabulary:   explained above.
    sublinear_tf: use sublinear TF scaling, i.e. replace the raw term frequency tf with 1 + log(tf).
    smooth_idf:   when computing idf = log(numerator / denominator) the denominator can be 0;
                  smooth_idf avoids this by using log(numerator / (1 + denominator)).
                  It is on by default, so there is nothing to worry about.
    norm:         normalisation. TF-IDF is TF * IDF, and TF may or may not be normalised;
                  normalising is the usual choice and is on by default.
    max_df:       some words have a very high document frequency (a word that appears in every
                  document is useless for telling categories apart), so we can set a threshold,
                  e.g. the float 0.5 (range [0.0, 1.0]), meaning a word appearing in more than
                  50% of the documents is treated as a temporary stop word. It can also be an
                  int, e.g. max_df=10, meaning a word appearing in more than 10 documents is
                  treated as a temporary stop word.
    min_df:       the opposite of max_df. A lower document frequency seems more discriminative,
                  but if it is too low, e.g. a word occurring in only 1 of 10,000 documents,
                  that single document inflates the dimensionality of the vector space for no gain.
    Note that max_df and min_df are ignored when the vocabulary parameter is given.
    '''
    writebunchobj(space_path, tfidfspace)
    print("TF-IDF vector space created successfully!")
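# The long comment above states that TfidfVectorizer is equivalent to CountVectorizer followed
# by TfidfTransformer. A small self-contained check on a toy corpus (the corpus is made up):
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

corpus = ["我 喜欢 相国大人", "我 喜欢 机器 学习", "机器 学习 很 有趣"]

two_step = TfidfTransformer().fit_transform(CountVectorizer().fit_transform(corpus))
one_step = TfidfVectorizer().fit_transform(corpus)

print(np.allclose(two_step.toarray(), one_step.toarray()))  # True: the two pipelines agree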