def split_position(self):
    position = self.text.toPlainText()
    # By default, split at the first position
    if position == "":
        pre_word_word = globe.word.word[:1]
        next_word_word = globe.word.word[1:]
        globe.word.word = pre_word_word
        globe.word.flag = "UnKnown"
        globe.word_all[globe.word_index] = globe.word
        insert_word = pseg.pair(next_word_word, "UnKnown")
        globe.word_all.insert(globe.word_index + 1, insert_word)
        fill(self)
    elif int(position) > len(globe.word.word) - 1:
        msg(self, u"拆分位置越位")  # "Split position out of range"
    else:
        pre_word_word = globe.word.word[:int(position)]
        next_word_word = globe.word.word[int(position):]
        globe.word.word = pre_word_word
        globe.word.flag = "UnKnown"
        globe.word_all[globe.word_index] = globe.word
        insert_word = pseg.pair(next_word_word, "UnKnown")
        globe.word_all.insert(globe.word_index + 1, insert_word)
        fill(self)
def segWithNerTag(sentence: str):
    soup = BeautifulSoup(sentence, "html5lib")
    pairs = []
    e_count = 0
    temp_str = ""
    for tag in soup.body.contents:
        if isinstance(tag, Tag):
            pairs.extend(segOnly(temp_str))
            from jieba.posseg import pair
            pairs.append(pair(tag.text, tag.name))
            if e_count == 0:
                position_e1 = len(pairs) - 1
            elif e_count == 1:
                position_e2 = len(pairs) - 1
            temp_str = ""
            e_count += 1
        elif isinstance(tag, NavigableString):
            temp_str += tag
        if e_count > 2:
            break
    if e_count != 2:
        return None
    if len(temp_str) > 0:
        pairs.extend(segOnly(temp_str))
    return pairs, (position_e1, position_e2)
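# A minimal usage sketch for segWithNerTag above. The segOnly stub here is
# illustrative only (assumed to wrap jieba's posseg.cut for untagged spans);
# it is not the original project's helper.
from bs4 import BeautifulSoup, Tag, NavigableString  # imports the function relies on
import jieba.posseg as pseg

def segOnly(text):
    # hypothetical stand-in: plain jieba POS segmentation of untagged text
    return list(pseg.cut(text)) if text else []

result = segWithNerTag('昨天<e1>小明</e1>在公园见到了<e2>小红</e2>。')
if result is not None:
    pairs, (pos_e1, pos_e2) = result
    print([(p.word, p.flag) for p in pairs])
    print('entity indices:', pos_e1, pos_e2)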
def __init__(self, t):
    super().__init__(t)
    self.model = [
        pseg.pair(word=x[0].strip().lower(), flag=x[1])
        for x in nltk.pos_tag(nltk.word_tokenize(t))
    ]
    self.model = list(
        filter(lambda x: x.word != ' ' and x.flag[0] != '.', self.model))
def tag_words_unique(wordSegWithTagLst):
    tmp = []
    seen = set()
    for word, flag in wordSegWithTagLst:
        # pseg.pair does not define __eq__, so duplicates are tracked on
        # (word, flag) tuples rather than on the pair objects themselves
        if (word, flag) in seen:
            log_exp.debug('duplicate word:%s', word + '/' + flag)
        else:
            seen.add((word, flag))
            tmp.append(pseg.pair(word, flag))
    return tmp
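# Hedged usage sketch for tag_words_unique above: log_exp is assumed to be a
# module-level logger, and the input mirrors jieba's (word, flag) output.
import logging
import jieba.posseg as pseg

log_exp = logging.getLogger(__name__)  # assumption: the logger referenced inside tag_words_unique

tagged = [(w.word, w.flag) for w in pseg.cut('今天天气不错,今天适合散步')]
unique_pairs = tag_words_unique(tagged)
print([(p.word, p.flag) for p in unique_pairs])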
def __init__(self, s, bot_name, bot_params):
    super().__init__(s, bot_name, bot_params)
    self.model = [
        pseg.pair(word=x[0].strip().lower(), flag=x[1])
        for x in nltk.pos_tag(nltk.word_tokenize(s))
    ]
    self.model = list(
        filter(lambda x: x.word != ' ' and x.flag[0] != '.', self.model))
    # words were lowercased above, so the placeholder token is matched as 'x'
    self.target_ids = list(
        compress(range(len(self.model)),
                 [x.word == 'x' for x in self.model]))
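# Standalone illustration (not from the original project) of the
# itertools.compress pattern used above: collect the indices of placeholder
# tokens after NLTK tagging. Assumes the NLTK 'punkt' and
# 'averaged_perceptron_tagger' resources are already installed.
from itertools import compress
import nltk
import jieba.posseg as pseg

text = 'please replace X with the real value'
model = [pseg.pair(word=w.strip().lower(), flag=f)
         for w, f in nltk.pos_tag(nltk.word_tokenize(text))]
model = [p for p in model if p.word != ' ' and p.flag[0] != '.']
# compress(data, selectors) keeps the items of data whose selector is truthy
target_ids = list(compress(range(len(model)), [p.word == 'x' for p in model]))
print(target_ids)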
def text_map(self, document, allowPOS=('ns', 'n', 'vn', 'v')):
    """
    Cut the document text into sentences, POS-filter the words, and group
    single words plus 2- and 3-word combinations by character length.

    Parameter:
        - document: dict with 'category' and 'text' fields.
        - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v'].
          if the POS of w is not in this list, it will be filtered.
    """
    category = document['category']
    doc_text = document['text']
    # print("cutting and words grouping")
    self.pos_filt = frozenset(allowPOS)
    words_per_length = {}
    # cut text into sentences
    sentences = self.cut_sent(doc_text)
    for sentence in sentences:
        try:
            # print([wp for wp in self.tokenizer.cut(sentence)])
            sentence_words = [wp for wp in self.tokenizer.cut(sentence)
                              if self.pairfilter(wp)]
            for sentence_word in sentence_words:
                word_length = len(sentence_word.word)
                if word_length not in words_per_length:
                    words_per_length[word_length] = [sentence_word.word]
                else:
                    words_per_length[word_length].append(sentence_word.word)
            for group_length in range(2, 4):
                for i, wp in enumerate(sentence_words):
                    if i + group_length > len(sentence_words):
                        break
                    group = sentence_words[i:i + group_length]
                    group_flag = 'v' if 'v' in [_wp.flag[0] for _wp in group] else 'n'
                    group_word = "".join([_wp.word for _wp in group])
                    group_pair = pair(group_word, group_flag)
                    group_word_length = len(group_pair.word)
                    if group_word_length not in words_per_length:
                        words_per_length[group_word_length] = [group_pair.word]
                    else:
                        words_per_length[group_word_length].append(group_pair.word)
        except Exception as e:
            print("error processing document: {}".format(e))
    return words_per_length
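# Standalone sketch of the grouping step used in text_map above: adjacent
# posseg pairs are merged into 2- and 3-word n-grams with a combined flag.
# The variable names here are illustrative, not from the original class.
import jieba.posseg as pseg
from jieba.posseg import pair

sentence_words = [wp for wp in pseg.cut('自然语言处理技术发展迅速')]
words_per_length = {}
for group_length in range(2, 4):
    for i in range(len(sentence_words) - group_length + 1):
        group = sentence_words[i:i + group_length]
        group_flag = 'v' if 'v' in [wp.flag[0] for wp in group] else 'n'
        group_pair = pair(''.join(wp.word for wp in group), group_flag)
        words_per_length.setdefault(len(group_pair.word), []).append(group_pair.word)
print(words_per_length)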
def __reencoding(words):
    return [
        pseg.pair(word.word.encode("utf_8"), word.flag) for word in words
    ]
def dict2obj(d):
    return pt.pair(**d)
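# Hedged usage sketch for dict2obj above: pt is assumed to be jieba.posseg
# imported under that alias, so a dict with 'word' and 'flag' keys maps onto
# the pair constructor's keyword arguments.
import jieba.posseg as pt

p = dict2obj({'word': '北京', 'flag': 'ns'})
print(p.word, p.flag)  # -> 北京 ns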
# (truncated tail of a word-vector / lexer output omitted)
# print(json1)
# pynlpir.open()
s = '在华盛顿期间,习主席还先后会见了前来参加本届核安全峰会的丹麦首相拉斯穆森、韩国总统朴槿惠和阿根廷总统马克里,并出席了伊核问题六国机制领导人会议。'
# li=list(posseg.cut(s))
result = [pair('在', 'p'), pair('华盛顿', 'ns'), pair('期间', 'f'), pair(',', 'x'),
          pair('习', 'v'), pair('主席', 'n'), pair('还', 'd'), pair('先后', 't'),
          pair('会见', 'n'), pair('了', 'ul'), pair('前来', 't'), pair('参加', 'v'),
          pair('本届', 'r'), pair('核', 'n'), pair('安全', 'an'), pair('峰会', 'n'),
          pair('的', 'uj'), pair('丹麦', 'ns'), pair('首相', 'd'), pair('拉斯', 'nrt'),
          pair('穆森', 'nr'), pair('、', 'x'), pair('韩国', 'ns'), pair('总统', 'n'),
          pair('朴槿惠', 'nr'), pair('和', 'c'), pair('阿根廷', 'nr'), pair('总统', 'n'),
          pair('马克里', 'nr'), pair(',', 'x'), pair('并', 'c'), pair('出席', 'v'),
          pair('了', 'ul'), pair('伊', 'j'), pair('核', 'n'), pair('问题', 'n'),
          pair('六', 'm'), pair('国', 'n'), pair('机制', 'n'), pair('领导人', 'n'),
          pair('会议', 'n'), pair('。', 'x')]
# print(li)
# # seg=pynlpir.segment(s)
# print(seg)
# json1=client.lexer(s)
# print(json1)
json1 = {
    'log_id': 6307372633409014593,
    'text': '在华盛顿期间,习主席还先后会见了前来参加本届核安全峰会的丹麦首相拉斯穆森、韩国总统朴槿惠和阿根廷总统马克里,并出席了伊核问题六国机制领导人会议。',
    'items': [{
        'loc_details': [],
        'byte_offset': 0,
        'uri': '',
        'pos': 'p',