Пример #1
0
def _is_ascii_alpha(token):
    """Return True if *token* is non-empty and contains only ASCII letters."""
    if not token:
        return False
    return all('a' <= ch <= 'z' or 'A' <= ch <= 'Z' for ch in token)


def seg_line(line):
    '''
    Segment one line of text and return the kept tokens joined by spaces.

    Pipeline: strip trailing whitespace, segment into words, POS-tag the
    words, filter by part of speech, remove stop words, drop purely
    English (ASCII-alphabetic) tokens, then join with single spaces.

    Relies on the names ``segmentor``, ``postagger``, ``PosFilter`` and
    ``StopWord``, which are not defined in this block and must be
    available at module level.

    Args:
        line: one line of raw text (a trailing newline is allowed).

    Returns:
        A single string with the surviving tokens separated by spaces.
    '''
    line = line.rstrip()  # drop the trailing newline / whitespace
    words = segmentor.segment(line)  # word segmentation
    postags = postagger.postag(words)  # part-of-speech tagging
    # Filter the words by part of speech, then remove stop words.
    pos_filter = PosFilter(words, postags)
    words_filter = pos_filter.filter_words()
    rm_stop_word = StopWord.filter_words(words_filter)
    # Drop English words only. A bare ``str.isalpha()`` is also True for
    # CJK characters on Unicode strings, which would discard every Chinese
    # word; restrict the check to ASCII letters instead.
    kept = [ele for ele in rm_stop_word if not _is_ascii_alpha(ele)]
    return ' '.join(kept)  # join the segmentation result with spaces
Пример #2
0
def seg_line(line, segmentor, postagger):
    '''
    Process one sentence and return its filtered tokens as a list.

    The sentence is stripped of trailing whitespace, segmented with
    ``segmentor``, POS-tagged with ``postagger``, filtered by part of
    speech through ``PosFilter`` and finally passed through
    ``StopWord.filter_words``. ``PosFilter`` and ``StopWord`` are not
    defined in this block and must be available at module level.

    Args:
        line: the raw sentence text (a trailing newline is allowed).
        segmentor: object exposing ``segment(text)``.
        postagger: object exposing ``postag(words)``.

    Returns:
        A new list holding the tokens left after stop-word removal.
        No English-word or length filtering is applied here.
    '''
    stripped = line.rstrip()  # drop the trailing newline / whitespace
    tokens = segmentor.segment(stripped)  # word segmentation
    tags = postagger.postag(tokens)  # part-of-speech tagging
    # Keep only words of the wanted parts of speech, then remove stop words.
    pos_kept = PosFilter(tokens, tags).filter_words()
    without_stop_words = StopWord.filter_words(pos_kept)
    # Materialise the result as a fresh list for the caller.
    return list(without_stop_words)