Example #1
def split_position(self):
    position = self.text.toPlainText()

    # By default, split after the first character
    if position == "":
        pre_word_word = globe.word.word[:1]
        next_word_word = globe.word.word[1:]

        globe.word.word = pre_word_word
        globe.word.flag = "UnKnown"
        globe.word_all[globe.word_index] = globe.word

        insert_word = pseg.pair(next_word_word, "UnKnown")
        globe.word_all.insert(globe.word_index + 1, insert_word)
        fill(self)
    elif int(position) > len(globe.word.word) - 1:
        msg(self, u"拆分位置越位")  # "split position out of range"
    else:
        # Split at the user-specified position
        pre_word_word = globe.word.word[:int(position)]
        next_word_word = globe.word.word[int(position):]

        globe.word.word = pre_word_word
        globe.word.flag = "UnKnown"
        globe.word_all[globe.word_index] = globe.word

        insert_word = pseg.pair(next_word_word, "UnKnown")
        globe.word_all.insert(globe.word_index + 1, insert_word)
        fill(self)
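
The two branches above differ only in the split index; a minimal standalone sketch of the same step, using jieba.posseg.pair with hypothetical names instead of the project's globe/fill/msg helpers:

from jieba.posseg import pair

def split_pair(word_pair, position=1):
    # Split one pair into two pairs at `position`; both halves get the flag "UnKnown".
    if not 0 < position < len(word_pair.word):
        raise ValueError("split position out of range")
    return (pair(word_pair.word[:position], "UnKnown"),
            pair(word_pair.word[position:], "UnKnown"))

# split_pair(pair("华盛顿", "ns"), 2) -> (pair('华盛', 'UnKnown'), pair('顿', 'UnKnown'))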
Example #2
from bs4 import BeautifulSoup, NavigableString, Tag
from jieba.posseg import pair


def segWithNerTag(sentence: str):
    soup = BeautifulSoup(sentence, "html5lib")
    pairs = []
    e_count = 0
    temp_str = ""
    for tag in soup.body.contents:
        if isinstance(tag, Tag):
            # Segment the plain text collected so far, then append the tagged
            # entity as a single pair whose flag is the tag name.
            pairs.extend(segOnly(temp_str))
            pairs.append(pair(tag.text, tag.name))
            if e_count == 0:
                position_e1 = len(pairs) - 1
            elif e_count == 1:
                position_e2 = len(pairs) - 1
            temp_str = ""
            e_count += 1
        elif isinstance(tag, NavigableString):
            temp_str += tag
        if e_count > 2:
            break
    # Only sentences with exactly two tagged entities are accepted.
    if e_count != 2:
        return None
    if len(temp_str) > 0:
        pairs.extend(segOnly(temp_str))
    return pairs, (position_e1, position_e2)
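
A hedged usage sketch: segOnly is not shown in this example; assuming it simply wraps jieba.posseg, a stand-in and a call with two illustrative entity tags (tag names are made up here) could look like this:

from jieba import posseg

def segOnly(text):
    # Stand-in for the project's helper: plain POS segmentation with jieba.
    return list(posseg.cut(text)) if text else []

result = segWithNerTag("<person>朴槿惠</person>访问了<location>华盛顿</location>。")
if result is not None:
    pairs, (position_e1, position_e2) = result
    # pairs[position_e1].flag == 'person', pairs[position_e2].flag == 'location'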
Example #3
 def __init__(self, t):
     super().__init__(t)
     # Tokenize and POS-tag with NLTK, then store tokens as jieba-style pair objects.
     self.model = [
         pseg.pair(word=x[0].strip().lower(), flag=x[1])
         for x in nltk.pos_tag(nltk.word_tokenize(t))
     ]
     self.model = list(
         filter(lambda x: x.word != ' ' and x.flag[0] != '.', self.model))
Example #4
def tag_words_unique(wordSegWithTagLst):
    tmp = []
    for word, flag in wordSegWithTagLst:
        newPair = pseg.pair(word, flag)
        if newPair in tmp:
            log_exp.debug('duplicate word:%s', word + '/' + flag)
        else:
            tmp.append(newPair)
    return tmp
Example #5
 def __init__(self, s, bot_name, bot_params):
     super().__init__(s, bot_name, bot_params)
     self.model = [
         pseg.pair(word=x[0].strip().lower(), flag=x[1])
         for x in nltk.pos_tag(nltk.word_tokenize(s))
     ]
     self.model = list(
         filter(lambda x: x.word != ' ' and x.flag[0] != '.', self.model))
     self.target_ids = list(
         compress(range(len(self.model)),
                  [x.word == 'X' for x in self.model]))
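
The compress call above is presumably itertools.compress (imported elsewhere in the project); it keeps the items of the first iterable whose selector is true, so target_ids collects the indices of the literal token 'X' in self.model:

from itertools import compress

list(compress(range(4), [False, True, False, True]))  # -> [1, 3]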
Example #6
File: fenci.py Project: whsasf/kwm
    def text_map(self, document, allowPOS=('ns', 'n', 'vn', 'v')):
        """
        Extract keywords from sentence using TextRank algorithm.
        Parameter:
            - topK: return how many top keywords. `None` for all possible words.
            - withWeight: if True, return a list of (word, weight);
                          if False, return a list of words.
            - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v'].
                        if the POS of w is not in this list, it will be filtered.
            - withFlag: if True, return a list of pair(word, weight) like posseg.cut
                        if False, return a list of words
        """
        category = document['category']
        doc_text = document['text']
        #print("cutting and words grouping")

        self.pos_filt = frozenset(allowPOS)
        words_per_length = {}
        # cut text into sentences
        sentences = self.cut_sent(doc_text)
        for sentence in sentences:
            try:
                #print([wp for wp in self.tokenizer.cut(sentence)])
                sentence_words = [wp for wp in self.tokenizer.cut(
                    sentence) if self.pairfilter(wp)]
                for sentence_word in sentence_words:
                    word_length = len(sentence_word.word)
                    if word_length not in words_per_length:
                        words_per_length[word_length] = [sentence_word.word]
                    else:
                        words_per_length[word_length].append(
                            sentence_word.word)
                for group_length in range(2, 4):
                    for i, wp in enumerate(sentence_words):
                        if i+group_length > len(sentence_words):
                            break
                        group = sentence_words[i:i+group_length]
                        group_flag = 'v' if 'v' in [
                            _wp.flag[0] for _wp in group] else 'n'
                        group_word = "".join([_wp.word for _wp in group])
                        group_pair = pair(group_word, group_flag)
                        group_word_length = len(group_pair.word)
                        if group_word_length not in words_per_length:
                            words_per_length[group_word_length] = [
                                group_pair.word]
                        else:
                            words_per_length[group_word_length].append(
                                group_pair.word)
            except Exception as e:
                print("error processing document: {}".format(e))
        return words_per_length
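
pairfilter is not shown in this example; a comparable stand-in, modeled on the filter used by jieba's own TextRank (illustrative only, the project's actual implementation may differ):

    def pairfilter(self, wp):
        # Keep only pairs whose POS is allowed and whose word has at least two characters.
        return wp.flag in self.pos_filt and len(wp.word.strip()) >= 2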
Example #7
 def __reencoding(words):
     return [
         pseg.pair(word.word.encode("utf_8"), word.flag) for word in words
     ]
Example #8
 def __reencoding(words):
     return [pseg.pair(word.word.encode("utf_8"), word.flag) for word in words]
Example #9
 def dict2obj(d):
     return pt.pair(**d)
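
A hedged usage sketch, assuming pt is an alias for jieba.posseg:

import jieba.posseg as pt

p = dict2obj({'word': '华盛顿', 'flag': 'ns'})
print(p.word, p.flag)  # 华盛顿 ns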
Example #10
File: testNLPIR.py Project: shishi11/nlp
# (truncated numeric vector output omitted)

# print(json1)



# pynlpir.open()
s = '在华盛顿期间,习主席还先后会见了前来参加本届核安全峰会的丹麦首相拉斯穆森、韩国总统朴槿惠和阿根廷总统马克里,并出席了伊核问题六国机制领导人会议。'

# li=list(posseg.cut(s))
result=[pair('在', 'p'), pair('华盛顿', 'ns'), pair('期间', 'f'), pair(',', 'x'), pair('习', 'v'), pair('主席', 'n'), pair('还', 'd'), pair('先后', 't'), pair('会见', 'n'), pair('了', 'ul'), pair('前来', 't'), pair('参加', 'v'), pair('本届', 'r'), pair('核', 'n'), pair('安全', 'an'), pair('峰会', 'n'), pair('的', 'uj'), pair('丹麦', 'ns'), pair('首相', 'd'), pair('拉斯', 'nrt'), pair('穆森', 'nr'), pair('、', 'x'), pair('韩国', 'ns'), pair('总统', 'n'), pair('朴槿惠', 'nr'), pair('和', 'c'), pair('阿根廷', 'nr'), pair('总统', 'n'), pair('马克里', 'nr'), pair(',', 'x'), pair('并', 'c'), pair('出席', 'v'), pair('了', 'ul'), pair('伊', 'j'), pair('核', 'n'), pair('问题', 'n'), pair('六', 'm'), pair('国', 'n'), pair('机制', 'n'), pair('领导人', 'n'), pair('会议', 'n'), pair('。', 'x')]
# print(li)
#
# seg=pynlpir.segment(s)
# print(seg)

# json1=client.lexer(s)
# print(json1)
json1={
    'log_id': 6307372633409014593,
    'text': '在华盛顿期间,习主席还先后会见了前来参加本届核安全峰会的丹麦首相拉斯穆森、韩国总统朴槿惠和阿根廷总统马克里,并出席了伊核问题六国机制领导人会议。',
    'items': [{
        'loc_details': [],
        'byte_offset': 0,
        'uri': '',
        'pos': 'p',