def segment(sentence, is_cut2char=False, enable_pos=False):
    """
    Tokenize a sentence.
    :param sentence:
    :param is_cut2char: False use jieba.lcut; True use list(sentence)
    :param enable_pos: bool, enable POS
    :return: list
    """
    import jieba
    from jieba import posseg
    jieba.setLogLevel(log_level="ERROR")
    if enable_pos:
        if not is_cut2char:
            word_pos_seq = posseg.lcut(sentence)
            word_seq, pos_seq = [], []
            for w, p in word_pos_seq:
                word_seq.append(w)
                pos_seq.append(p)
            return word_seq, pos_seq
        else:
            word_seq = list(sentence)
            pos_seq = []
            for w in word_seq:
                w_p = posseg.lcut(w)
                pos_seq.append(w_p[0].flag)
            return word_seq, pos_seq
    else:
        if not is_cut2char:
            return jieba.lcut(sentence)
        else:
            return list(sentence)
def segment(sentence, cut_type='word', pos=False):
    """
    Tokenize a sentence.
    :param sentence:
    :param cut_type: 'word' use jieba.lcut; 'char' use list(sentence)
    :param pos: enable POS
    :return: list
    """
    if pos:
        if cut_type == 'word':
            word_pos_seq = posseg.lcut(sentence)
            word_seq, pos_seq = [], []
            for w, p in word_pos_seq:
                word_seq.append(w)
                pos_seq.append(p)
            return word_seq, pos_seq
        elif cut_type == 'char':
            word_seq = list(sentence)
            pos_seq = []
            for w in word_seq:
                w_p = posseg.lcut(w)
                pos_seq.append(w_p[0].flag)
            return word_seq, pos_seq
    else:
        if cut_type == 'word':
            return jieba.lcut(sentence)
        elif cut_type == 'char':
            return list(sentence)
def segment(sentence, cut_type='word', pos=False, None_flag='O'):
    """
    Tokenize a sentence.
    :param sentence:
    :param cut_type: 'word' use jieba.lcut; 'char' use list(sentence)
    :param pos: enable POS
    :param None_flag: the 'O' tag of a 'BIO' tagging scheme
    :return: list
    """
    import logging
    jieba.default_logger.setLevel(logging.ERROR)
    if pos:
        if cut_type == 'word':
            word_pos_seq = posseg.lcut(sentence)
            word_seq, pos_seq = [], []
            for w, p in word_pos_seq:
                word_seq.append(w)
                pos_seq.append(p)
            return word_seq, pos_seq
        elif cut_type == 'char':
            word_seq = list(sentence)
            pos_seq = []
            for w in word_seq:
                w_p = posseg.lcut(w)
                pos_seq.append(w_p[0].flag)
            return word_seq, pos_seq
    else:
        if cut_type == 'word':
            return jieba.lcut(sentence)
        elif cut_type == 'char':
            return list(sentence)
def SegJieba(InfoGene):
    (name, yxsj, fj, yxzd) = next(InfoGene)
    print(name)
    keywords = get_key(yxsj)
    yxsj_vec = re.split(r'[,。;]+', yxsj)
    # a custom user dict could also be used here
    word_list = jieba.lcut(yxsj)
    freq_dist = nltk.FreqDist(word_list)
    print(freq_dist)
    for i in freq_dist:
        print(i)
    jieba.add_word(word="两肺", freq=None, tag='n')
    jieba.add_word(word="支气管壁", freq=None, tag='n')
    jieba.add_word(word="左肺", freq=None, tag='n')
    clinic_dict = {}
    discrip = ''
    for sent in yxsj_vec:
        print([(x.word, x.flag) for x in psg.lcut(sent)])
    for sent in yxsj_vec:
        for x in psg.lcut(sent):
            if x.word in keywords and x.flag == 'n':
                key = x.word
                discrip = clinic_dict.get(key, "")
            if x.word in keywords and (x.flag == 'a' or x.flag == 'v'):
                discrip = discrip + x.word
                clinic_dict[key] = discrip
                if discrip != "":
                    print(key, clinic_dict[key])
def segment(sentence, cut_type='word', pos=False):
    """
    :param sentence: the dataset will later be joined into sentences
    :param cut_type: granularity: 'word' or 'char' level
    :param pos: whether to tag POS; off by default
    :return:
    """
    if pos:  # first branch on whether POS tagging is requested
        if cut_type == 'word':
            word_pos_seq = posseg.lcut(sentence)
            '''
            Note the difference between cut and lcut:
            cut returns a generator and has to be iterated with a for loop to get each word,
            while lcut directly returns a list.
            '''
            word_seq, pos_seq = [], []
            # two lists: the first holds the segmented words, the second their POS tags (when pos=True)
            for w, p in word_pos_seq:
                word_seq.append(w)
                pos_seq.append(p)
            return word_seq, pos_seq
        elif cut_type == 'char':
            word_seq = list(sentence)
            pos_seq = []
            for w in word_seq:
                w_p = posseg.lcut(w)
                pos_seq.append(w_p[0].flag)  # take the flag (POS tag) at index 0 of the pair
            return word_seq, pos_seq
    else:
        if cut_type == 'word':
            return jieba.lcut(sentence)
        elif cut_type == 'char':  # at char granularity, list() is enough
            return list(sentence)
def segment(sentence, cut_type='word', pos=False):
    seg_words = []
    seg_pos = []
    if cut_type == 'word':
        if pos == True:
            seg_word_pos = posseg.lcut(sentence)
            for word, flag in seg_word_pos:  # named `flag` to avoid shadowing the `pos` argument
                seg_words.append(word)
                seg_pos.append(flag)
            return seg_words, seg_pos
        elif pos == False:
            seg_words = jieba.lcut(sentence)
            return seg_words
    if cut_type == 'char':
        if pos == True:
            for char in sentence:
                seg_word_pos = posseg.lcut(char)
                for word, flag in seg_word_pos:
                    seg_words.append(word)
                    seg_pos.append(flag)
            return seg_words, seg_pos
        elif pos == False:
            for char in sentence:
                seg_words.append(char)
            return seg_words
def getCosinSimilarity(self, str1, str2):
    soupfcontent1 = BeautifulSoup(str(str1), "html.parser")
    content_table1 = soupfcontent1.find_all('table')
    soupfcontent2 = BeautifulSoup(str(str2), "html.parser")
    content_table2 = soupfcontent2.find_all('table')
    cut_str1 = [
        w for w, t in posseg.lcut(str(content_table1))
        if 'n' in t or 'v' in t
    ]
    cut_str2 = [
        w for w, t in posseg.lcut(str(content_table2))
        if 'n' in t or 'v' in t
    ]
    # list all words
    all_words = set(cut_str1 + cut_str2)
    # term frequencies
    freq_str1 = [cut_str1.count(x) for x in all_words]
    freq_str2 = [cut_str2.count(x) for x in all_words]
    # cosine similarity
    sum_all = sum(map(lambda z, y: z * y, freq_str1, freq_str2))
    sqrt_str1 = math.sqrt(sum(x**2 for x in freq_str1))
    sqrt_str2 = math.sqrt(sum(x**2 for x in freq_str2))
    cosin_similarity = sum_all / (sqrt_str1 * sqrt_str2)
    print cosin_similarity
def loadDocument(stopList):
    global docList
    docList = []
    for file in os.listdir(negPath):
        news = None
        with open(os.path.join(negPath, file), 'r', encoding='utf-8') as f:
            news = f.read()
        noun = [word for word, flag in pseg.lcut(news) if flag.startswith('n')]
        news = list(jieba.cut(news))
        news = [word for word in news if (word not in stopList) and (word not in noun)]  # filter out stop words and nouns
        docList.append(news)
    for file in os.listdir(neuPath):
        news = None
        with open(os.path.join(neuPath, file), 'r', encoding='utf-8') as f:
            news = f.read()
        noun = [word for word, flag in pseg.lcut(news) if flag.startswith('n')]
        news = list(jieba.cut(news))
        news = [word for word in news if (word not in stopList) and (word not in noun)]  # filter out stop words and nouns
        docList.append(news)
    for file in os.listdir(posPath):
        news = None
        with open(os.path.join(posPath, file), 'r', encoding='utf-8') as f:
            news = f.read()
        noun = [word for word, flag in pseg.lcut(news) if flag.startswith('n')]
        news = list(jieba.cut(news))
        news = [word for word in news if (word not in stopList) and (word not in noun)]  # filter out stop words and nouns
        docList.append(news)
    return None
def segment(sentence: str, cut_type: str = 'word', pos: bool = False) -> list:
    """
    Tokenize a sentence.
    :param sentence: the sentence to segment
    :param cut_type: 'word' use jieba.lcut; 'char' use list(sentence)
    :param pos: enable POS tagging
    :return: list of the resulting tokens
    """
    if pos:
        if cut_type == 'word':
            word_pos_seq = posseg.lcut(sentence)
            word_seq, pos_seq = [], []
            for w, p in word_pos_seq:
                word_seq.append(w)
                pos_seq.append(p)
            return word_seq, pos_seq
        elif cut_type == 'char':  # split by character
            word_seq = list(sentence)  # split the sentence into a list of single characters
            pos_seq = []
            for w in word_seq:
                w_p = posseg.lcut(w)
                pos_seq.append(w_p[0].flag)  # take the corresponding POS flag
            return word_seq, pos_seq
    else:
        if cut_type == 'word':  # split by word
            return jieba.lcut(sentence)
        elif cut_type == 'char':
            return list(sentence)
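# A minimal usage sketch for the segment() helper above. The sample sentence is
# made up for illustration; it assumes `import jieba` and `from jieba import posseg`
# are present at module level, as in the surrounding snippets.
words = segment('我爱北京天安门', cut_type='word')   # e.g. ['我', '爱', '北京', '天安门']
chars = segment('我爱北京天安门', cut_type='char')   # ['我', '爱', '北', '京', '天', '安', '门']
tokens, tags = segment('我爱北京天安门', cut_type='word', pos=True)
print(list(zip(tokens, tags)))                       # e.g. [('我', 'r'), ('爱', 'v'), ('北京', 'ns'), ('天安门', 'ns')]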
def segment(sentence, cut_type='word', pos=False):
    """
    Tokenize a sentence.
    :param sentence:
    :param cut_type: 'word' use jieba.lcut; 'char' use list(sentence)
    :param pos: enable POS
    :return: list
    """
    import logging
    jieba.default_logger.setLevel(logging.ERROR)
    if pos:
        if cut_type == 'word':
            word_pos_seq = posseg.lcut(sentence)
            word_seq, pos_seq = [], []
            for w, p in word_pos_seq:
                word_seq.append(w)
                pos_seq.append(p)
            return word_seq, pos_seq
        elif cut_type == 'char':
            word_seq = list(sentence)
            pos_seq = []
            for i in word_seq:
                w_p = posseg.lcut(i)
                pos_seq.append(w_p[0].flag)
            return word_seq, pos_seq
    else:
        if cut_type == 'word':
            return jieba.lcut(sentence)
        elif cut_type == 'char':
            return list(sentence)
def _cut_jieba(row):
    """
    cut the sentences into tokens
    :param row:
    :return:
    """
    cut_words = []
    cut_flags = []
    if '。' in row:
        row = row.split('。')
        for idx, s in enumerate(row):
            if idx != len(row) - 1:
                s = s + '。'
            s_cut = list(pseg.lcut(s, HMM=False))
            cut_words.extend([c.word for c in s_cut])
            cut_flags.extend([c.flag for c in s_cut])
    else:
        s_cut = list(pseg.lcut(row, HMM=False))
        cut_words = [c.word for c in s_cut]
        cut_flags = [c.flag for c in s_cut]
    new_row = pd.Series()
    new_row['tokens'] = cut_words
    new_row['flags'] = cut_flags
    return new_row
def loadWords(stopList):
    global wordsList
    wordsSet = set()
    for file in os.listdir(negPath):
        news = None
        with open(os.path.join(negPath, file), 'r', encoding='utf-8', errors='ignore') as f:
            news = f.read()
        noun = [word for word, flag in pseg.lcut(news) if flag.startswith('n')]  # collect the nouns
        news = set(jieba.cut(news))
        news = {word for word in news if (word not in stopList) and (word not in noun)}  # filter out stop words and nouns
        wordsSet = news | wordsSet  # union of the sets
        # finally convert to a list, because the result has to keep a stable order
        wordsList = list(wordsSet)
    for file in os.listdir(neuPath):
        news = None
        with open(os.path.join(neuPath, file), 'r', encoding='utf-8', errors='ignore') as f:
            news = f.read()
        noun = [word for word, flag in pseg.lcut(news) if flag.startswith('n')]  # collect the nouns
        news = set(jieba.cut(news))
        news = {word for word in news if (word not in stopList) and (word not in noun)}  # filter out stop words and nouns
        wordsSet = news | wordsSet  # union of the sets
        # finally convert to a list, because the result has to keep a stable order
        wordsList = list(wordsSet)
    for file in os.listdir(posPath):
        news = None
        with open(os.path.join(posPath, file), 'r', encoding='utf-8', errors='ignore') as f:
            news = f.read()
        noun = [word for word, flag in pseg.lcut(news) if flag.startswith('n')]  # collect the nouns
        news = set(jieba.cut(news))
        news = {word for word in news if (word not in stopList) and (word not in noun)}  # filter out stop words and nouns
        wordsSet = news | wordsSet  # union of the sets
        # finally convert to a list, because the result has to keep a stable order
        wordsList = list(wordsSet)
    return None
def tokenizer(self, text):
    if type(text) is list:
        result = list()
        for s in text:
            result.append(pseg.lcut(s))
        return result
    else:
        return pseg.lcut(text)
def simicos(str1, str2):
    cut_str1 = [w for w, t in posseg.lcut(str1) if t != 'x']
    cut_str2 = [w for w, t in posseg.lcut(str2) if t != 'x']
    if cut_str1 != [] and cut_str2 != []:
        all_words = set(cut_str1 + cut_str2)
        freq_str1 = [cut_str1.count(x) for x in all_words]
        freq_str2 = [cut_str2.count(x) for x in all_words]
        sum_all = sum(map(lambda z, y: z * y, freq_str1, freq_str2))
        sqrt_str1 = math.sqrt(sum(x ** 2 for x in freq_str1))
        sqrt_str2 = math.sqrt(sum(x ** 2 for x in freq_str2))
        return sum_all / (sqrt_str1 * sqrt_str2)
    else:
        return 0
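# A minimal usage sketch for simicos() above: bag-of-words cosine similarity over
# jieba tokens, with punctuation (flag 'x') dropped. The strings are made up; it
# assumes `import math` and `from jieba import posseg` at module level.
score = simicos('我喜欢看电影', '我喜欢看电视')
print('similarity: %.3f' % score)  # roughly 0.75; the exact value depends on how jieba segments the strings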
def process_postag(text):
    word, pos = [], []
    for w, p in posseg.lcut(text):
        word += [w] * len(w)
        pos += [p] * len(w)
    return word, pos
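# A minimal usage sketch for process_postag() above: each word and its POS tag are
# repeated once per character, so both output lists are character-aligned with the
# input text. The example text is made up; it assumes `from jieba import posseg`.
words, tags = process_postag('北京欢迎你')
# e.g. words -> ['北京', '北京', '欢迎', '欢迎', '你'], tags -> ['ns', 'ns', 'v', 'v', 'r']
print(list(zip(words, tags)))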
def delNOTNeedWords(content, customstopwords=None):
    # words = jieba.lcut(content)
    if customstopwords == None:
        customstopwords = "stopwords.txt"
        import os
        if os.path.exists(customstopwords):
            stop_words = codecs.open(customstopwords, encoding='UTF-8').read().split(u'\n')
            customstopwords = stop_words

    result = ''
    return_words = []
    # for w in words:
    #     if w not in stopwords:
    #         result += w.encode('utf-8')  # +"/"+str(w.flag)+" "  # drop stop words
    words = pseg.lcut(content)

    for word, flag in words:
        # print word.encode('utf-8')
        tempword = word.encode('utf-8').strip(' ')
        if (word not in customstopwords and len(tempword) > 0 and
                flag in [u'n', u'nr', u'ns', u'nt', u'nz', u'ng', u't', u'tg', u'f',
                         u'v', u'vd', u'vn', u'vf', u'vx', u'vi', u'vl', u'vg',
                         u'a', u'an', u'ag', u'al', u'm', u'mq', u'o', u'x']):
            # and flag[0] in [u'n', u'f', u'a', u'z']
            # ["/x","/zg","/uj","/ul","/e","/d","/uz","/y"]
            # drop stop words and other POS tags, e.g. tokens that are not nouns or verbs
            result += tempword  # +"/"+str(w.flag)+" "
            return_words.append(tempword)
    return result, return_words
def split_words(s):
    # traditional -> simplified Chinese
    s = SplitWords.__convert(s)
    # strip tags
    s = SplitWords.__del_non_tag(s)
    # remove punctuation
    s = SplitWords.__del_punctuation(s)
    # remove digits
    s = SplitWords.__del_digit(s)
    # segment, keeping POS tags
    words = pseg.lcut(s, HMM=True)
    # re-encode as UTF-8
    words = SplitWords.__reencoding(words)
    # drop Chinese stop words
    words = SplitWords.__del_stop(words, SplitWords.__read_chinese_stoplist())
    # drop English stop words
    words = SplitWords.__del_stop(words, SplitWords.__read_english_stoplist())
    # drop extra whitespace
    words = SplitWords.__del_blank(words)
    # drop words with useless POS tags, then strip the POS tag from the remaining words
    words = SplitWords.__del_non_pos(words)
    return words
def parse_by_rules(self, text):
    self.words = pseg.lcut(parse_cn_number(text), HMM=False)
    while self.has_next():
        beginning = self.get_index()
        self.consume_year_period() \
            or self.consume_month_period() \
            or self.consume_day_period()
        self.consume_weekday_period() \
            or self.consume_hour_period() \
            or self.consume_minute_period()
        self.consume_year() \
            or self.consume_month() \
            or self.consume_day()
        self.consume_hour()
        if self.get_index() != beginning:
            # Time found
            self.consume_word(u'准时')
            if self.consume_word(u'提醒'):
                self.consume_word(u'我')
            if self.current_tag() == 'v' and self.peek_next_word() == u'我':
                self.advance(2)
            self.consume_to_end()
            return Remind(time=self.now, desc=text, event=self.do_what)
        else:
            self.advance()
    return None
def maxSimTxt(self, intxt, simCondision=0.15, simType='simple'):
    """
    Find the sentence in the knowledge base that is most similar to the input sentence.
    :param intxt: input text
    :param simCondision: similarity threshold
    :param simType:
    :return:
    """
    self.lastTxt.append(intxt)
    if simType not in ('simple', 'simple_pos', 'vec'):
        return 'error: maxSimTxt的simType类型不存在: {}'.format(simType)

    # if no word vectors are loaded, fall back to the simple_pos method
    embedding = self.vecModel
    if simType == 'vec' and not embedding:
        simType = 'simple_pos'

    for t in self.zhishiku:
        questions = t.q_vec if simType == 'vec' else t.q_word
        in_vec = jieba.lcut(intxt) if simType == 'simple' else pseg.lcut(intxt)

        t.sim = max(
            similarity(in_vec, question, method=simType, embedding=embedding)
            for question in questions)

    maxSim = max(self.zhishiku, key=lambda x: x.sim)
    logger.info('maxSim=' + format(maxSim.sim, '.0%'))
    if maxSim.sim < simCondision:
        return [''], ''

    return maxSim.q, maxSim.a
def _cut_word(self, comment):
    # segment the comment
    word_pairs = posseg.lcut(comment, HMM=False)
    result = []
    for t in word_pairs:
        if not (t.word in result or t.word in self.stop_words):
            result.append(t.word)
    return '/'.join(result)
def participle(self, raw_sentence):
    """Segment the raw sentence and strip punctuation; return two lists:
    the first with the tokens, the second with their POS tags."""
    m = []
    n = []
    # age expressions
    age_list = re.findall("\d+岁.*?月|\d+岁半|\d+岁|\d+年级|[一二三四五六七八九]年级", raw_sentence)
    # date/time expressions
    time_list = re.findall("\d+号上午\d+点|\d+号下午\d+点|\d+号上午|\d+号下午|\d+号晚上|\d+号|\d+[::]\d+", raw_sentence)
    total = age_list + time_list
    for i in total:
        jieba.add_word(i)
    for i, j in pseg.lcut(raw_sentence):
        # drop punctuation / stop words
        if i not in self.stopwords:
            m.append(i)
            n.append(j)
    # merge consecutive place names, e.g. ['北京市', '海淀区', '西土城路'] becomes '北京市海淀区西土城路'
    index = []
    for i in range(len(n)):
        if n[i] == 'ns':
            index.append(i)
    if len(index) > 1:
        for i in range(index[-1] - index[0]):
            m[index[0]] += m[index[0] + i + 1]
            m[index[0] + i + 1] = ''
            n[index[0] + i + 1] = ''
        x, y = [], []
        for i in m:
            if i != '':
                x.append(i)
        for i in n:
            if i != '':
                y.append(i)
    else:
        x, y = m, n
    return x, y
def delNOTNeedWords(content, customstopwords=None):
    # words = jieba.lcut(content)
    if customstopwords == None:
        import os
        file_stop_words = "stopwords.txt"
        if os.path.exists(file_stop_words):
            stop_words = codecs.open(file_stop_words, encoding="UTF-8").read()
            customstopwords = stop_words

    result = ""
    return_words = []
    # for w in words:
    #     if w not in stopwords:
    #         result += w.encode('utf-8')  # +"/"+str(w.flag)+" "  # drop stop words
    words = pseg.lcut(content)

    for word, flag in words:
        # print word.encode('utf-8')
        if word not in customstopwords and flag[0]:
            # in [u'n', u'f', u'a', u'v', u'd', u'z']
            # ["/x","/zg","/uj","/ul","/e","/d","/uz","/y"]
            # drop stop words and other POS tags, e.g. tokens that are not nouns or verbs
            result += word.encode("utf-8")  # +"/"+str(w.flag)+" "
            return_words.append(word.encode("utf-8"))
    return result, return_words
def maxSimTxt(self, intxt, simCondision=0.1, simType='simple'):
    """
    Find the sentence in the knowledge base that is most similar to the input sentence.
    simType=simple, simple_POS, vec
    """
    self.lastTxt.append(intxt)
    if simType not in ('simple', 'simple_pos', 'vec'):
        return 'error: maxSimTxt的simType类型不存在: {}'.format(simType)

    # if no word vectors are loaded, fall back to the simple_pos method
    embedding = self.vecModel
    if simType == 'vec' and not embedding:
        simType = 'simple_pos'

    for t in self.zhishiku:
        questions = t.q_vec if simType == 'vec' else t.q_word
        in_vec = jieba.lcut(intxt) if simType == 'simple' else pseg.lcut(intxt)

        t.sim = max(
            similarity(in_vec, question, method=simType, embedding=embedding)
            for question in questions)

    maxSim = max(self.zhishiku, key=lambda x: x.sim)
    logger.info('maxSim=' + format(maxSim.sim, '.0%'))
    if maxSim.sim < simCondision:
        return '抱歉,我没有理解您的意思。请您询问有关汽车的话题。'

    return maxSim.a
async def _(session: NLPSession):
    # strip leading/trailing whitespace from the message
    stripped_msg = session.msg_text.strip()
    # segment the message and tag POS
    words = posseg.lcut(stripped_msg)

    dates = {'今天': 0, '明天': 1, '后天': 2}
    searchInfo = {'city': None, 'date': None}

    # iterate over the list returned by posseg.lcut
    for word in words:
        # each element is a pair object with `word` and `flag` attributes,
        # holding the token and its POS tag respectively
        if word.flag == 'ns' and searchInfo['city'] is None:
            # the ns tag marks place names
            searchInfo['city'] = word.word
        if word.flag == 't' and searchInfo['date'] is None:
            if word.word in dates:
                searchInfo['date'] = dates[word.word]
        if (not searchInfo['city'] is None) and (not searchInfo['date'] is None):
            break
    if searchInfo['date'] is None:
        searchInfo['date'] = 0

    # return an intent command; the first two arguments are required:
    # the confidence and the intent command name
    return IntentCommand(80.0, 'weather', current_arg=json.dumps(searchInfo) or '')
def multilingual_sent_split(texts):
    print('\nOriginal texts: ', texts)
    lingual_split_sign = {'x', 'eng'}
    final_parts = []
    sub_part = []
    cuts = pseg.lcut(texts)
    for idx in range(len(cuts) - 1):
        # if the current word has the same kind of POS tag as the next word,
        # append the current word to the current sub_part
        if (cuts[idx].flag in lingual_split_sign and cuts[idx + 1].flag in lingual_split_sign) or (
                cuts[idx].flag not in lingual_split_sign and cuts[idx + 1].flag not in lingual_split_sign):
            sub_part.append(cuts[idx].word)
        # otherwise close the current sub_part, add it to final_parts, and start a new sub_part
        else:
            sub_part.append(cuts[idx].word)
            final_parts.append(sub_part)
            sub_part = []
    # if the last word has the same kind of tag as the second-to-last word,
    # append it to the current sub_part
    if (cuts[-1].flag in lingual_split_sign and cuts[-2].flag in lingual_split_sign) or (
            cuts[-1].flag not in lingual_split_sign and cuts[-2].flag not in lingual_split_sign):
        sub_part.append(cuts[-1].word)
    # otherwise the last word becomes a new sub_part of its own
    else:
        final_parts.append([cuts[-1].word])
    if sub_part:
        final_parts.append(sub_part)
    final_strs = [''.join(_l) for _l in final_parts]
    print('Cut texts: ', final_strs)
    return final_strs
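# A minimal usage sketch for multilingual_sent_split() above: it groups consecutive
# tokens by whether their POS tag is in {'x', 'eng'}, so runs of English/punctuation
# and runs of Chinese end up in separate segments. The example text is made up and
# assumes `import jieba.posseg as pseg` at module level.
multilingual_sent_split('今天天气不错 hello world 我们去公园吧')
# expected output is roughly: ['今天天气不错', ' hello world ', '我们去公园吧']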
def filter_stop_words(content):
    result = []  # final result to return
    words = pseg.lcut(content)  # segment
    for word in words:
        if word.word.strip() not in stop_words and word.flag[0] in [u'n']:
            result.append(word.word.strip().encode('utf-8'))
    return result
def segment_file(in_file, out_file, word_sep=' ', pos_sep='/', is_pos=True):
    """
    segment input file to output file
    :param in_file:
    :param out_file:
    :param word_sep:
    :param pos_sep:
    :param is_pos: whether POS tags are needed
    :return:
    """
    jieba.enable_parallel()
    with open(in_file, 'r', encoding='utf-8') as fin, open(out_file, 'w', encoding='utf-8') as fout:
        count = 0
        for line in fin:
            in_line = line.strip()
            seg_line = ''
            if is_pos:
                words = posseg.lcut(in_line)
                for word, pos in words:
                    seg_line += word + pos_sep + pos + word_sep
            else:
                words = jieba.lcut(in_line)
                for word in words:
                    seg_line += word + word_sep
            fout.write(seg_line + "\n")
            count += 1
        print("segment ok. input file count:", count)
def extract_keyword(self, text, number):
    """
    Extract keywords.
    :param text: input text
    :param number: number of keywords to extract
    :return: the top `number` keywords ranked by importance
    """
    graph = TextRank()
    occu2num = defaultdict(int)
    seg_list = psg.lcut(text)
    for i, pair in enumerate(seg_list):
        if pair.flag[0] in self.tag and len(pair.word) > 1:
            for j in range(i + 1, i + 1 + self.span):
                if j >= len(seg_list):
                    break
                if seg_list[j].flag[0] not in self.tag or len(seg_list[j].word) < 2:
                    continue
                if (seg_list[j].word, pair.word) in occu2num:
                    occu2num[(seg_list[j].word, pair.word)] += 1
                else:
                    occu2num[(pair.word, seg_list[j].word)] += 1
    for key, value in occu2num.items():
        graph.add_edge(key[0], value, key[1])
    node_rank = graph.build_rank()
    node_rank = sorted(node_rank.items(), key=lambda x: x[1], reverse=True)
    return node_rank[:number]
def analyse_wordVector(model, name_list, sentence):
    for name in name_list:
        print('{}的词向量为:\n{}'.format(name, model[name]))
        print('与{}最相关的词:{}'.format(name, model.most_similar(name)))
        topn = 3  # look at the top-3 words most related to '令狐冲'
        print('跟{}相关性前{}的词:\n{}'.format(name, topn, model.similar_by_word(name, topn=topn)))
        print('跟{}关系相当于师妹跟林平之的关系的词:\n{}'.format(
            name, model.most_similar(['师妹', '林平之'], [name], topn=topn)))
        print('跟{}关系相当于师妹跟圣姑的关系的词:\n{}'.format(
            name, model.most_similar(['师妹', '圣姑'], [name], topn=topn)))

    # u"令狐冲 任盈盈 林平之 岳不群 东方不败"
    a, b = '令狐冲', '师妹'
    print('集合{}中不同类的词语:{}'.format(
        name_list, model.wv.doesnt_match(u"令狐冲 任盈盈 林平之 岳不群 东方不败".split())))  # pick the odd word out of the set
    print('{}和{}之间的相关度:{}'.format(a, b, model.wv.similarity(a, b)))  # similarity between two words

    # after segmentation, analyse the properties of each word
    sentence = poss.lcut(sentence)
    # cut() segments and returns a generator, whose items are accessed by iterating;
    # lcut() returns a list, and list(jieba.cut()) is equivalent to jieba.lcut()
    print(sentence)
    # nr: person name  r: pronoun  v: verb
    print('测试句子中的人名有:', [list(i)[0] for i in sentence if list(i)[1] == 'nr'])  # ['林平之']
def delNOTNeedWords(content, customstopwords=None):
    # words = jieba.lcut(content)
    if customstopwords == None:
        customstopwords = "stopwords.txt"
        import os
        if os.path.exists(customstopwords):
            stop_words = codecs.open(customstopwords, encoding='UTF-8').read().split(u'\n')
            customstopwords = stop_words

    result = ''
    return_words = []
    # for w in words:
    #     if w not in stopwords:
    #         result += w.encode('utf-8')  # +"/"+str(w.flag)+" "  # drop stop words
    words = pseg.lcut(content)

    for word, flag in words:
        # print word.encode('utf-8')
        tempword = word.encode('utf-8').strip(' ')
        if (word not in customstopwords and len(tempword) > 0 and
                flag in [u'n', u'nr', u'ns', u'nt', u'nz', u'ng', u't', u'tg', u'f',
                         u'v', u'vd', u'vn', u'vf', u'vx', u'vi', u'vl', u'vg',
                         u'a', u'an', u'ag', u'al', u'm', u'mq', u'o', u'x']):
            # and flag[0] in [u'n', u'f', u'a', u'z']
            # ["/x","/zg","/uj","/ul","/e","/d","/uz","/y"]
            # drop stop words and other POS tags, e.g. tokens that are not nouns or verbs
            result += tempword  # +"/"+str(w.flag)+" "
            return_words.append(tempword)
    return result, return_words
def cut_word(raw_data):
    cut_result = []
    prbl_cl = list(raw_data)
    for entry in prbl_cl:
        word_cut = psg.lcut(entry)
        cut_result += word_cut
    return cut_result
def cut_sentence(self, sentence, by_word=True, use_stop=True, with_sg=False):
    """
    with_sg: whether to include POS tags
    """
    assert by_word != True or with_sg != True, '单个字分词没有词性'
    if by_word is True:
        return self._cut_by_word(sentence)
    else:
        jb_content = psg.lcut(sentence)
        if use_stop is True:
            # drop tokens that are stop words
            jb_content = [i for i in jb_content if i.word not in self.stop_word]
        if with_sg is True:
            jb_content = [(i.word, i.flag) for i in jb_content]
        else:
            jb_content = [i.word for i in jb_content]
        return jb_content
def string2json(text, save_path=None):
    # input: text is a string
    # output: sample is a json; save sample to a json file and return True
    assert isinstance(save_path, str)

    # generate sentence and eligible entities
    lst_text = pseg.lcut(text)
    sentence = list()
    lst_entity = list()
    for i, j in lst_text:
        if ('n' in j) or (j in ['i', 'j', 's', 'l']):
            # nouns, plus idioms, abbreviations, place/locative words and temporary phrases;
            # this also covers the unknown-word tag "un"
            lst_entity.append(i)
        sentence.append(i)
    sentence = " ".join(sentence)
    lst_entity = list(set(lst_entity))

    # generate sample with json structure
    sample = list()
    for head, tail in itertools.combinations(lst_entity, 2):  # pairwise combinations of candidate words
        d = {
            "sentence": sentence,
            "head": {"word": str(head), "id": str(head)},
            "tail": {"word": str(tail), "id": str(tail)},
            "relation": ""}
        sample.append(d)
        # symmetric pair
        d = {
            "sentence": sentence,
            "head": {"word": str(tail), "id": str(tail)},
            "tail": {"word": str(head), "id": str(head)},
            "relation": ""}
        sample.append(d)

    # save sample
    with open(save_path, "w") as f:
        json.dump(sample, f)
    return True
def abstract_question(self, question):
    """
    Segment the question with jieba and abstract the key words into their POS classes.
    :param question:
    :return:
    """
    self.abstractMap = {}
    list_word = pseg.lcut(question)  # Chinese word segmentation
    abstractQuery = ''
    nr_count = 0
    for item in list_word:
        word = item.word
        pos = str(item)
        if 'nm' in pos:  # movie title
            abstractQuery += "nm "
            self.abstractMap['nm'] = word
        elif 'nr' in pos and nr_count == 0:
            abstractQuery += 'nnt '
            self.abstractMap['nnt'] = word
            nr_count += 1
        elif 'nr' in pos and nr_count == 1:  # when nr appears a second time, use nnr instead
            abstractQuery += "nnr "
            self.abstractMap['nnr'] = word
            nr_count += 1
        elif 'x' in pos:
            abstractQuery += "x "
            self.abstractMap['x'] = word
        else:
            abstractQuery += word + " "
    return abstractQuery
def get_question_by_rowQ1(rowQ):
    # print(rowQ)
    # print(rowQ)
    line = p.sub("", rowQ[-1]).replace("\n", "")
    # line = rowQ[-2] + p.sub("", line).replace("\n", "")
    pos_list = posseg.lcut(line)
    question = ""
    if rule_1(line, pos_list, rowQ) != None:
        question = rule_1(line, pos_list, rowQ)
    elif rule_2(line, pos_list, rowQ) != None:
        question = rule_2(line, pos_list, rowQ)
    else:
        # question = line
        # print(rowQ)
        # print("x")
        question = rule_3(line, pos_list, rowQ)
    question = question.lower()
    if question.find(rowQ[0].lower()) == -1:
        question = rowQ[0].lower() + " " + question
    # print(question)
    # print("~~~"*30)
    return question
def pseg_adj_n(row):
    pair_pseg = [item for item in pseg.lcut(row[3])]
    adj = [word for word, flag in pair_pseg if flag[0] == "a"]
    n = [word for word, flag in pair_pseg if flag[0] == "n"]
    dataset = [adj, n]
    return dataset
def delstopwords(content):
    result = ''
    words = pseg.lcut("".join(content.split()))
    for word, flag in words:
        # drop stop words and unwanted POS tags (e.g. tokens that are not nouns or verbs);
        # note: jieba POS flags carry no leading slash
        if word not in stopwords and flag not in ["x", "zg", "uj", "ul", "e", "d", "uz", "y"]:
            result += word.encode('utf-8')  # +"/"+str(w.flag)+" "
    return result
def readfile(filename):
    jieba.load_userdict('../dict.txt')  # load a custom user dictionary; tip: adjust the word frequencies
    file = open(filename)
    content = file.readline()
    parts = content.split(' ')
    res = pseg.lcut(parts[2])  # returns a list
    # for i in range(1, len(res)):
    #     print (res[i]).word
    for word, flag in res:
        if flag == 'nr':
            print word
def cut_with_flag(raw_str, filter_invalid_word_flag=True):
    """
    :param raw_str: str
    :return: list[(str, str)]
    """
    res = [(a, b) for a, b in pseg.lcut(raw_str)]
    if filter_invalid_word_flag:
        return filter_invalid_word(res)
    else:
        return res
def ranking_function(output_prob_tree, cx, cy):
    # tonal pattern (level vs. oblique tones)
    x_py = pypinyin.pinyin(cx, style=pypinyin.TONE2)
    y_py = pypinyin.pinyin(cy, style=pypinyin.TONE2)
    x_pz = map(lambda i: -1 if int(re.search('\d', i[0]).group(0)) <= 2 else 1, x_py)
    y_pz = map(lambda i: -1 if int(re.search('\d', i[0]).group(0)) <= 2 else 1, y_py)
    pingze_score = sum(map(lambda i, j: i + j == 0, x_pz, y_pz)) / float(len(cx)) + 0.001

    def sigmoid(x):
        return 1 / (1 + math.e ** (-x))

    def pos_eq(x_pos, y_pos):
        return x_pos == y_pos or x_pos in y_pos or y_pos in x_pos

    import operator
    smooth_value = 0.001
    freq_amp = 10 ** math.sqrt(len(cx))

    # part of speech
    cx_pos = map(lambda x: zip(*pseg.lcut(x)[0])[0][1], cx)
    cy_pos = map(lambda y: zip(*pseg.lcut(y)[0])[0][1], cy)
    pos_score = reduce(operator.add, map(lambda x, y: float(1) / len(cx) if pos_eq(x, y) else 0, cx_pos, cy_pos))
    pos_score += smooth_value

    # output probability
    out_score = reduce(operator.mul, map(lambda x, y: output_prob_tree[x][y] * freq_amp, cx, cy))
    out_score = sigmoid(out_score)
    out_score += smooth_value

    # combine
    score = pingze_score * out_score * pos_score
    # score = pingze_score * pos_score
    # print 'ranking', cy
    # print 'pingze', pingze_score
    # print 'pos', pos_score
    # print 'freq', out_score
    return score
def delstopwords(content):
    # words = jieba.lcut(content)
    result = ''
    # for w in words:
    #     if w not in stopwords:
    #         result += w.encode('utf-8')  # +"/"+str(w.flag)+" "  # drop stop words
    words = pseg.lcut(content)
    for word, flag in words:
        # drop stop words and unwanted POS tags (e.g. tokens that are not nouns or verbs);
        # note: jieba POS flags carry no leading slash
        if word not in stopwords and flag not in ["x", "zg", "uj", "ul", "e", "d", "uz", "y"]:
            result += word.encode('utf-8')  # +"/"+str(w.flag)+" "
    # print result
    return result
def on_post(self, req, resp):
    body = req.stream.read()
    if not body:
        raise falcon.HTTPBadRequest("Empty request body")

    # seg_list = list(jieba.cut(body, cut_all=False))
    words = pseg.lcut(body)
    result = list()
    for word, flag in words:
        tmp = posSeg.posSeg(word, flag)
        result.append(tmp.__dict__)

    resp.body = json.dumps(result)
    resp.status = falcon.HTTP_200
def sentence_to_vector(sentence, padding=True, padding_length=10):
    '''
    Convert every word of the sentence into a word vector; words outside the word2vec
    vocabulary (OOV) are initialised randomly. If padding is True, the sentence is
    padded with zero vectors up to padding_length, or truncated if it is too long.
    :param sentence: input sentence
    :type sentence: list
    :param padding: bool
    :param padding_length:
    :type padding_length: int
    :return:
    :rtype: np.array
    '''
    vectors = []
    for item in sentence:
        # print item
        try:
            vector = word2vec_model[unicode(item)]
        except:
            logging.debug(u'出现未知词(%s),随机填充' % (item))
            vector = get_unkown_vector(50)
        # print vector
        vectors.append(vector)
    if padding:
        if len(vectors) > padding_length:
            logging.debug(u'对句子进行截断:%s' % (' '.join(sentence)))
            seg_index = range(len(vectors))
            # print seg_index
            # filter out pronouns
            counter = 0
            for item in posseg.lcut(' '.join(sentence)):
                # print counter
                if item.flag == 'x':
                    # seg_index.remove(index)
                    continue
                if item.flag == 'r':
                    logging.debug('去除:%s' % (item))
                    seg_index.remove(counter)
                if len(seg_index) == padding_length:
                    break
                counter += 1
            vectors = np.asarray(vectors)[seg_index][:padding_length]
            sentence = np.asarray(sentence)[seg_index][:padding_length]
            logging.debug(u'对句子进行截断后:%s' % (' '.join(sentence)))
        elif len(vectors) < padding_length:
            vectors.extend([get_None_vector(50)] * (padding_length - len(vectors)))
    return np.asarray(vectors)
def delNOTNeedWords(content, stopwords):
    # words = jieba.lcut(content)
    result = ''
    # for w in words:
    #     if w not in stopwords:
    #         result += w.encode('utf-8')  # +"/"+str(w.flag)+" "  # drop stop words
    words = pseg.lcut(content)
    for word, flag in words:
        # print word.encode('utf-8')
        # drop stop words and other POS tags, e.g. tokens that are not nouns or verbs
        if word not in stopwords and flag[0] in [u'n', u'f', u'a', u'z']:
            result += word.encode('utf-8')  # +"/"+str(w.flag)+" "
    return result
async def _(session: NLPSession):
    # strip leading/trailing whitespace from the message
    stripped_msg_text = session.msg_text.strip()
    # segment the message and tag POS
    words = posseg.lcut(stripped_msg_text)

    city = None
    # iterate over the list returned by posseg.lcut
    for word in words:
        # each element is a pair object with `word` and `flag` attributes,
        # holding the token and its POS tag respectively
        if word.flag == 'ns':
            # the ns tag marks place names
            city = word.word

    # return the result; the three arguments are the confidence,
    # the command name, and the arguments for the command session
    return NLPResult(90.0, 'weather', {'city': city})
def delNOTNeedWords(content, stopwords):
    # words = jieba.lcut(content)
    result = ''
    # for w in words:
    #     if w not in stopwords:
    #         result += w.encode('utf-8')  # +"/"+str(w.flag)+" "  # drop stop words
    words = pseg.lcut(content)  # jieba.cut()
    text_list = []
    for word, flag in words:
        # print word.encode('utf-8')
        # drop stop words and unwanted POS tags (e.g. tokens that are not nouns or verbs);
        # note: jieba POS flags carry no leading slash
        if word not in stopwords and flag not in ["x", "zg", "uj", "ul", "e", "d", "uz", "y"]:
            # text_list.append(word.encode('utf-8'))
            result += word.encode('utf-8')  # +"/"+str(w.flag)+" "
    # ''.join(text_list)
    return result
def seg(sentence, sep='|', full_mode=True, remove_stopword=False):
    '''
    Segment the sentence with jieba.
    :param sentence: the sentence to segment
    :type sentence: str
    :param remove_stopword: whether to remove stop words
    :type remove_stopword: bool
    :return: the segmented string, seg_srt
    :rtype: str
    '''
    # logging.debug('remove stopwords: %s' % remove_stopword)
    # for items in jseg.lcut(sentence):
    #     print items.flag, items.word
    seg = []
    pattern = re.compile('[0-9]+$')
    for items in jseg.lcut(sentence):
        # use the POS tag to drop punctuation
        if items.flag == 'x':
            logging.debug(u'句子(%s)将标点符号:"%s"替换成""' % (sentence, items.word))
            seg.append('')
            # continue
        if remove_stopword and items.word in stopword_list:
            logging.debug(u'句子(%s)去除stopwords:%s' % (sentence, items))
            continue
        # replace digits with a NUM-style marker
        if pattern.match(items.word) and items.word not in exclude_word_list:
            print items
            seg.append('DIGITTAG')
            logging.debug(u'句子(%s)将数字:"%s" 替换成标记:"DIGITTAG"' % (sentence, items.word))
        else:
            seg.append(items.word)
    # sentence = [items.word for items in jseg.lcut(sentence) if items.flag != 'x']
    sentence = ' '.join(seg)
    # print sentence
    # print sentence
    seg_list = jieba.lcut(sentence, cut_all=full_mode)
    # print seg_list
    seg_list = [item for item in seg_list if len(item.strip()) != 0]
    # print seg_list
    seg_srt = sep.join(seg_list)
    return seg_srt
def delstopwords(content):
    stopwords = codecs.open('stopwords.txt', encoding='UTF-8').read()
    stopwordSet = set(stopwords.split('\n'))
    # words = jieba.lcut(content)
    result = ''
    # for w in words:
    #     if w not in stopwords:
    #         result += w.encode('utf-8')  # +"/"+str(w.flag)+" "  # drop stop words
    # v     verb
    # vd    adverbial verb
    # vn    nominal verb
    # vshi  the verb "是"
    # vyou  the verb "有"
    # vf    directional verb
    # vx    formal verb
    # vi    intransitive verb
    # vl    verb idiom
    # vg    verbal morpheme
    words = pseg.lcut(content)
    for word, flag in words:
        # drop stop words and unwanted POS tags (e.g. verbs and function words);
        # note: jieba POS flags carry no leading slash
        if word not in stopwordSet and flag not in ["x", "zg", "uj", "ul", "e", "d", "uz", "y",
                                                    "v", "vd", "vn", "vshi", "vyou", "vf", "vx", "vi", "vl", "vg"]:
            # if (word not in stopwords and flag in ["n", "a", "d"]):
            result += word.encode('utf-8')  # +"/"+str(w.flag)+" "
    # print result
    return result
def parse_by_rules(self, text):
    self.words = pseg.lcut(parse_cn_number(text), HMM=False)
    while self.has_next():
        beginning = self.get_index()
        self.consume_repeat()
        self.consume_year_period() \
            or self.consume_month_period() \
            or self.consume_day_period()
        self.consume_weekday_period() \
            or self.consume_hour_period() \
            or self.consume_minute_period() \
            or self.consume_second_period()
        self.consume_year() \
            or self.consume_month() \
            or self.consume_day()
        self.consume_hour()
        if self.get_index() != beginning:
            # Time found
            self.consume_word(u'准时')
            if self.consume_word(u'提醒'):
                self.consume_word(u'我')
            if self.current_tag() == 'v' and self.peek_next_word() == u'我':
                self.advance(2)
            self.consume_to_end()
            # Do not set event to None, since the serializer will just skip None
            # and we will have no chance to modify it
            remind = Remind(time=self.now, repeat=self.repeat, desc=text, event=self.do_what)
            remind.reschedule()
            return remind
        else:
            self.advance()
    return None
def seg(self, sentence, sep=' ', full_mode=False, remove_stopword=False,
        replace_number=False, lowercase=True, zhs2zht=True, remove_url=True, HMM=False):
    """
    Segment the sentence with jieba.

    :param sentence: the sentence to segment
    :type sentence: str
    :param sep: character used to join the tokens after segmentation; a space by default.
    :type sep: str
    :param full_mode: jieba option, whether to segment in full mode.
    :type full_mode: bool
    :param remove_stopword: whether to remove stop words
    :type remove_stopword: bool
    :param replace_number: whether to replace all numbers with a NUM marker
    :type replace_number: bool
    :param lowercase: whether to lowercase letters
    :type lowercase: bool
    :param zhs2zht: whether to convert traditional Chinese characters to simplified ones
    :type zhs2zht: bool
    :param remove_url: whether to remove weibo urls containing t.cn, e.g. addresses starting with http://t.cn/ or //t.cn/R50TdMg
    :type remove_url: bool
    :param HMM: whether to enable HMM-based new-word discovery; False by default
    :type HMM: bool
    :return: the segmented string, seg_srt
    :rtype: str
    """
    # first remove all spaces
    sentence = sentence.replace(' ', '')
    if lowercase:
        # convert to lower case
        sentence = sentence.lower()
    if zhs2zht:
        # traditional -> simplified Chinese
        sentence = self.convert_to_simple_chinese(sentence)
    if remove_url:
        # sentence = re.sub(u'(http:)//t.cn/[a-zA-Z0-9]*$', '', sentence)
        sentence = re.sub(u'(http:|)//t.cn/[a-zA-Z0-9]+', '', sentence)
    # pattern for matching numbers
    num_pattern = re.compile('[0-9][0-9\.]*$')
    words = []
    for item in jieba.lcut(sentence, HMM=False):
        if num_pattern.match(item):
            # the token is a number
            if not replace_number:
                words.append(item)
            elif item not in self.exclude_word_list:
                word = num_pattern.sub('NUMBER', item)
                words.append(word)
                if self.verbose > 1:
                    logging.debug(u'句子(%s)将数字:"%s" 替换成标记:"NUMBER"' % (sentence, item))
                    print(u'句子(%s)将数字:"%s" 替换成标记:"NUMBER"' % (sentence, item))
            else:
                words.append(item)
        elif remove_stopword and item in self.stopword_list:
            # remove stop words
            if self.verbose > 1:
                logging.debug(u'句子(%s)去除stopwords:%s' % (sentence, item))
        else:
            # for other tokens, a POS tag of x marks punctuation
            is_x = False
            for word, pos in jseg.lcut(item, HMM=HMM):
                # print word, pos
                if pos in ['x']:
                    is_x = True
                    # words.append(word)
            if is_x:
                # punctuation
                # print item
                if self.verbose > 1:
                    logging.debug(u'句子(%s)将标点符号:"%s"替换成""' % (sentence, ''))
            else:
                words.append(item)

    sentence = ' '.join(words)
    # print sentence
    seg_list = jieba.lcut(sentence, cut_all=full_mode)
    # print seg_list
    seg_list = [item for item in seg_list if len(item.strip()) != 0]
    # print seg_list
    seg_srt = sep.join(seg_list)
    return seg_srt
]

for sent, seg in testlist:
    print('/'.join(jieba.cut(sent, HMM=False)))
    word = ''.join(seg)
    print('%s Before: %s, After: %s' % (word, jieba.get_FREQ(word), jieba.suggest_freq(seg, True)))
    print('/'.join(jieba.cut(sent, HMM=False)))
    print("-" * 40)

# quit()

jieba.add_word('石墨烯')

seg_list = jieba.cut(p, cut_all=True)
print("Full Mode: " + "/ ".join(seg_list))  # full mode

seg_list = jieba.cut(p, cut_all=False)
print("Default Mode: " + "/ ".join(seg_list))  # accurate mode

seg_list = jieba.cut(p)  # accurate mode is the default
print(", ".join(seg_list))

seg_list = jieba.cut_for_search(p)  # search engine mode
print(", ".join(seg_list))

print(jieba.suggest_freq(('好', '我')))
print(jieba.suggest_freq(('走', '了')))

print(','.join(jieba.lcut(p)))
print(','.join(jieba.lcut_for_search(p)))
print(','.join(['%s/%s' % (i, j) for i, j in pseg.lcut(p)]))
def extract(input_string):
    sentences = input_string.split(" ")
    seg_list = [
        item for sentence in sentences
        for item in pseg.lcut(sentence)
    ]
    for word, flag in seg_list:
        print word, flag
    return [[word, flag] for word, flag in seg_list]
def __analyse_clause(self, the_clause, runout_filepath, print_show):
    sub_clause = {"score": 0, "positive": [], "negative": [], "conjunction": [], "punctuation": [], "pattern": []}
    seg_result = posseg.lcut(the_clause)

    # write the clause and its segmentation result to the run-output file for later review
    if runout_filepath is not None:
        self.__write_runout_file(runout_filepath, the_clause + '\n')
        self.__write_runout_file(runout_filepath, str(seg_result) + '\n')
    if print_show:
        print(the_clause)
        print(seg_result)

    # sentence pattern: 如果……就好了 ("if only ...")
    judgement = self.__is_clause_pattern2(the_clause)
    if judgement != "":
        sub_clause["pattern"].append(judgement)
        sub_clause["score"] -= judgement["value"]
        return sub_clause

    # sentence pattern: 是…不是… ("is ... not ...")
    judgement = self.__is_clause_pattern1(the_clause)
    if judgement != "":
        sub_clause["pattern"].append(judgement)
        sub_clause["score"] -= judgement["value"]

    # sentence pattern: phrases
    judgement = self.__is_clause_pattern3(the_clause, seg_result)
    if judgement != "":
        sub_clause["score"] += judgement["score"]
        if judgement["score"] >= 0:
            sub_clause["positive"].append(judgement)
        elif judgement["score"] < 0:
            sub_clause["negative"].append(judgement)
        match_result = judgement["key"].split(":")[-1]
        i = 0
        while i < len(seg_result):
            if seg_result[i].word in match_result:
                if i + 1 == len(seg_result) or seg_result[i + 1].word in match_result:
                    del (seg_result[i])
                    continue
            i += 1

    # analyse the segmented words one by one
    for i in range(len(seg_result)):
        mark, result = self.__analyse_word(seg_result[i].word, seg_result, i)
        if mark == 0:
            continue
        elif mark == 1:
            sub_clause["conjunction"].append(result)
        elif mark == 2:
            sub_clause["punctuation"].append(result)
        elif mark == 3:
            sub_clause["positive"].append(result)
            sub_clause["score"] += result["score"]
        elif mark == 4:
            sub_clause["negative"].append(result)
            sub_clause["score"] -= result["score"]

    # fold the sentiment values of the conjunctions into the score
    for a_conjunction in sub_clause["conjunction"]:
        sub_clause["score"] *= a_conjunction["value"]

    # fold the sentiment values of the punctuation marks into the score
    for a_punctuation in sub_clause["punctuation"]:
        sub_clause["score"] *= a_punctuation["value"]

    return sub_clause
def pos_seg(word):
    # for items in jseg.lcut(word):
    #     print items.flag, items.word
    return jseg.lcut(word)[0].flag