def NLPIRCutWithPos(self, isClearText=False):
    """Segment the instance text with ``nlpir.seg`` and cache the result.

    :param isClearText: when it compares equal to ``False`` the raw
        ``self.__text`` is segmented; otherwise the value returned by
        ``self.clearText()`` is segmented instead.
    :return: the list built from ``nlpir.seg(...)``; the same list is stored
        on ``self.__strWithPos``. Presumably its items are word/POS pairs,
        as the method name suggests -- confirm against ``nlpir.seg``.
    """
    # The explicit ``== False`` test is intentional: a value such as ``None``
    # does not compare equal to ``False`` and therefore selects clearText().
    if isClearText == False:
        source_text = self.__text
    else:
        source_text = self.clearText()
    tagged = list(nlpir.seg(source_text))
    self.__strWithPos = tagged
    return tagged
def parse_sentence(self):
    """Accumulate sentiment counts for ``self.sentence`` and pick feature words.

    The sentence is segmented with ``nlp.seg``. For every token whose text
    (``token[0]``) is a key of ``self.mdictionary``, that entry's positive,
    negative and zero counts are added to the sentence totals. Up to three
    entries whose ``attr`` starts with ``a``, ``v``, ``z``, ``d`` or ``e``
    are appended to ``self.word_list`` as feature words (``attr`` is
    presumably a part-of-speech tag -- confirm against the word class).

    If fewer than three feature words were found, ``self.word_list`` is
    padded up to three items with the dictionary entries of the leading
    tokens when the sentence has at least three tokens, otherwise with the
    entry of the first token repeated. A sentence that yields no tokens is
    left unpadded.

    :return: None; results are stored on the instance.
    :raises KeyError: if a token used for padding has no entry in
        ``self.mdictionary``.
    """
    feature_attr_list = ["a", "v", "z", "d", "e"]
    feature_word_count = 0
    feature_word_num = 3
    # Materialize the segmentation: the padding step needs len() and
    # indexing, which would fail if nlp.seg returned a generator.
    token_list = list(nlp.seg(self.sentence))
    for token in token_list:
        if token[0] not in self.mdictionary:
            continue
        mword = self.mdictionary.get(token[0])
        # Sum the positive / zero / negative counts of every known word.
        self.positive_word_count += mword.positive_count
        # NOTE(review): the attribute is spelled "netative_count" here;
        # confirm it matches the definition of the word class.
        self.negative_word_count += mword.netative_count
        self.zero_word_count += mword.zero_count
        # Words carrying a feature attribute join the sentence's word list,
        # up to the feature-word limit.
        if mword.attr[0] in feature_attr_list and feature_word_count < feature_word_num:
            self.word_list.append(mword)
            feature_word_count += 1
    # Fewer than three feature words: pad from the start of the sentence.
    # Nothing can be padded from an empty token list, so skip it instead of
    # failing on token_list[0].
    if feature_word_count < feature_word_num and token_list:
        missing = feature_word_num - len(self.word_list)
        use_leading_tokens = len(token_list) > 2
        for i in range(missing):
            token = token_list[i] if use_leading_tokens else token_list[0]
            # The dictionary is keyed by the token text (token[0]), not by
            # the whole token, so look the entry up by text.
            self.word_list.append(self.mdictionary[token[0]])
def ChineseWordsSegmentationByNLPIR2016(text):
    """Segment ``text`` with ``nlpir.seg`` and return the UTF-8 encoded words.

    :param text: the text handed to ``nlpir.seg``.
    :return: a list holding, for each item produced by ``nlpir.seg``, the
        item's first element encoded with ``'utf-8'``.
    """
    return [token[0].encode('utf-8') for token in nlpir.seg(text)]
def NLPIRCutText(self, isAddWord=False):
    """Segment ``self.__text`` and return the words joined by single spaces.

    :param isAddWord: when it compares equal to ``True``, every entry of
        ``self.__newWords`` and then of ``self.__userWords`` is passed to
        ``nlpir.AddUserWord`` before segmenting.
    :return: the UTF-8 encoded first element of each item from
        ``nlpir.seg``, joined with ``' '``. The list of encoded words is
        also stored on ``self.__seg_list``.
    """
    # Explicit ``== True`` is kept on purpose: truthy values that do not
    # compare equal to True (e.g. a non-empty string) must not add words.
    if isAddWord == True:
        for new_word in self.__newWords:
            nlpir.AddUserWord(new_word)
        for user_word in self.__userWords:
            nlpir.AddUserWord(user_word)
    tokens = nlpir.seg(self.__text)
    encoded_words = []
    # Bind the attribute before filling the list so it always refers to the
    # list being built, as in a plain append loop.
    self.__seg_list = encoded_words
    encoded_words.extend(tok[0].encode('utf-8') for tok in tokens)
    return ' '.join(encoded_words)
def parse_file(self):
    """Split a training-file line into words and update their label counts.

    Reads the first line of ``self.file_path``, derives its label and text
    with ``self.get_label`` / ``self.get_text`` and segments the text with
    ``nlp.seg``. Tokens whose second element is rejected by
    ``self.word_filter`` are skipped. For a word already present in
    ``self.mdict`` the counter matching the label (``"+1"``, ``"0"`` or
    ``"-1"``) is incremented; an unseen word is added to ``self.mdict`` as
    a new ``MyWord``.

    :return: None; ``self.mdict`` is updated in place.
    :raises TypeError: if the label is not one of ``"+1"``, ``"0"``,
        ``"-1"`` while a known word is being counted.
    """
    # NOTE(review): only the first line of the file is read here; confirm
    # whether the remaining lines are meant to be processed as well.
    # The handle is closed as soon as the line has been read so it does not
    # leak for the lifetime of the process.
    with open(self.file_path, "r") as f:
        line = f.readline()
    label = self.get_label(line)
    text = self.get_text(line)
    for token in nlp.seg(text):
        if self.word_filter(token[1]):
            continue
        # Membership is tested on the mapping itself; going through
        # ``.keys()`` builds and scans a list on Python 2.
        if token[0] in self.mdict:
            mword = self.mdict.get(token[0])
            inc_operator = {
                "+1": mword.inc_positive_count,
                "0": mword.inc_zero_count,
                "-1": mword.inc_negative_count,
            }
            inc_operator.get(label)()
        else:
            # NOTE(review): a newly created word is stored without any
            # counter being incremented for this label -- confirm that is
            # intended.
            mword = MyWord(token[0], token[1])
            self.mdict[token[0]] = mword
# Script fragment (Python 2 print statements): prints the result of
# cutstrpos(filestr2), converts it to (word, tag) pairs and prints them.
# filestr, filestr2 and cutstrpos are defined outside this fragment.
posstr = cutstrpos(filestr2)
# Debug output: show the type of the tagged string.
print type(posstr)
# print filestr
print '**** show is end ****'
print ' '
print 'This is posster'
print posstr
# Each whitespace-separated item of posstr is converted with
# nltk.tag.str2tuple; the loop below unpacks every result as (word, tag).
strtag = [nltk.tag.str2tuple(word) for word in posstr.split()]
# for item in strtag:
#     print item
# The four results below are assigned but not used in the visible part of
# the script.
strsBySeg = nlpir.seg(filestr)
strsBySeg2 = nlpir.seg(filestr2)
strsByParagraphProcess = nlpir.ParagraphProcess(filestr, 1)
# NOTE(review): the second argument is the first segmented word of filestr;
# confirm this is what ParagraphProcessA expects.
strsByParagraphProcessA = nlpir.ParagraphProcessA(
    filestr, ChineseWordsSegmentationByNLPIR2016(filestr)[0], 1)
print ' '
print ' '
print '**** strtag ****'
# The trailing comma keeps all "word / tag |" groups on one output line.
for word, tag in strtag:
    print word, "/", tag, "|",
print ' '
print ' '