Exemplo n.º 1
0
def review_model_predict_entities(model_predict_entities):
    """Post-filter model-predicted entities and realign them to word boundaries.

    For each (entity, sentence) pair, the entity span is re-cut with jieba so
    the final entity lands on whole-token boundaries, then a series of noise
    filters is applied (trailing percentages, over-long person names,
    single-character or all-digit strings, frequent verbs).

    Args:
        model_predict_entities: mapping of entity type -> list of
            (entity_text, sentence) tuples produced by the model.

    Returns:
        defaultdict(list): entity type -> list of cleaned entity strings.
    """
    word_tag_map = POSTokenizer().word_tag_tab
    idf_freq = TFIDF().idf_freq
    reviewed_entities = defaultdict(list)
    for ent_type, ent_and_sent_list in model_predict_entities.items():
        for ent, sent in ent_and_sent_list:
            # BUG FIX: the sentence was lowercased but the entity was not, so
            # any entity containing an uppercase character could never be
            # located and was silently discarded. Lowercase both sides.
            start = sent.lower().find(ent.lower())
            if start == -1:
                continue
            # Switch to 1-based positions so that the cumulative token length
            # `offset` (which equals the 1-based index of a token's last char)
            # can be compared directly against start/end.
            start += 1
            end = start + len(ent) - 1
            tokens = jieba.lcut(sent)
            offset = 0
            selected_tokens = []
            for token in tokens:
                offset += len(token)
                if offset >= start:
                    selected_tokens.append(token)
                if offset >= end:
                    break

            fixed_entity = ''.join(selected_tokens)
            # Strip a trailing percentage (e.g. "12.5%") merged in by the cut.
            fixed_entity = re.sub(r'\d*\.?\d+%$', '', fixed_entity)
            # Person names of 10+ characters are almost certainly noise.
            if ent_type == '人物' and len(fixed_entity) >= 10:
                continue
            # Single characters carry too little information to keep.
            if len(fixed_entity) <= 1:
                continue
            # Pure-digit strings are not valid entities.
            if re.fullmatch(r'\d+', fixed_entity):
                continue
            # Drop verbs that are common words (low IDF ⇒ high frequency);
            # these are typically segmentation artifacts, not entities.
            if word_tag_map.get(fixed_entity,
                                '') == 'v' and idf_freq[fixed_entity] < 7:
                continue
            reviewed_entities[ent_type].append(fixed_entity)
    return reviewed_entities
Exemplo n.º 2
0
 def __init__(self):
     """Build a jieba tokenizer preloaded with every user dictionary found
     in ``config.JIEBA_DICT_PATH`` (files whose path ends with "txt"),
     plus a POS tokenizer sharing the same vocabulary.
     """
     self.token = jieba.Tokenizer()
     # Gather candidate dictionary files from the configured directory.
     dict_paths = [
         entry.path for entry in os.scandir(config.JIEBA_DICT_PATH)
         if entry.path.endswith("txt")
     ]
     for dict_path in dict_paths:
         self.token.load_userdict(dict_path)
     # The POS tokenizer wraps the customized base tokenizer.
     self.pos_token = POSTokenizer(self.token)
Exemplo n.º 3
0
def import_jieba_posseg(dt=None):
    """Return a jieba part-of-speech tokenizer built around *dt*.

    Args:
        dt: optional base jieba tokenizer; ``None`` uses jieba's default.

    Returns:
        jieba.posseg.POSTokenizer: the POS tokenizer instance.
    """
    from jieba.posseg import POSTokenizer
    return POSTokenizer(tokenizer=dt)
Exemplo n.º 4
0
 def __init__(self, config):
     """Store *config* and create a default POS tokenizer."""
     # The tokenizer does not depend on the config, so order is free.
     self.dt = POSTokenizer()
     self.config = config
Exemplo n.º 5
0
def jieba_wrap_init():
	"""Initialize the module-level POS tokenizer from jieba's default tokenizer."""
	global posseg_tok
	posseg_tok = POSTokenizer(jieba.dt)
Exemplo n.º 6
0
	def __init__(self, config):
		"""Announce the naive cut tool, store *config*, and build a POS tokenizer."""
		print("----------using naive cut tool---------")
		# The two assignments are independent of each other.
		self.dt = POSTokenizer()
		self.config = config
Exemplo n.º 7
0
 def init_config(self, config):
     """Configure the tokenizer state and register the segment marker word.

     Sets up a fresh POS tokenizer, clears the cut flag and word-type list,
     and teaches the tokenizer the "<SEG>" marker with a high frequency so
     it is always kept as one token.
     """
     self.config = config
     self.cut_flag = False
     self.word_type = []
     self.dt = POSTokenizer()
     self.dt.add_word("<SEG>", 10000)
Exemplo n.º 8
0
 def init_config(self, config):
     """Remember *config* and create a fresh default POS tokenizer."""
     # Assignments are independent; tokenizer first, then config.
     self.dt = POSTokenizer()
     self.config = config