def cut_word(data, filterWord):
    """
    Cut words and extract brands after removing advertising articles
    and stop words.

    :param data: iterable of article dicts with 'title', 'text' and an
        optional 'brands' list
    :param filterWord: stop words and general words such as '的', '有'
    :return: (cutword, brands)
        cutword: list of UTF-8 byte strings, tokens joined by spaces,
            e.g. ['wo xi huan dazhong che', ...]
        brands: list of UTF-8 byte strings, e.g. ['dazhong', 'benchi']
    """
    cutword = []
    brands = []
    sentLen = []
    jieba.enable_parallel(30)
    all_num = 0
    cut_num = 0
    for lineSource in data:
        all_num += 1
        outline = trim(lineSource['title']) + '.' + trim(lineSource['text'])
        if 'brands' in lineSource:
            for brand in lineSource['brands']:
                brands.append(brand.encode('utf-8', 'ignore'))
        # Length is measured in characters, not bytes, hence the decode.
        sentlenth = len(outline.decode('utf-8', 'ignore'))
        sentLen.append(sentlenth)
        # Skip articles matched by the advertisement filter.
        if adver_esm.query(outline):
            continue
        # Keep only sentences of a reasonable length.
        if 5 < sentlenth < 4000:
            cutword.append(' '.join([
                i for i in norm_cut(outline) if i not in filterWord
            ]).encode('utf-8'))
            cut_num += 1
    jieba.disable_parallel()
    # Guard against empty input (ZeroDivisionError), and force float
    # division: under Python 2, int / int truncates, so the original
    # '%.2f' always printed a whole number.
    if all_num:
        logger.info('avg sent length %.2f' % (sum(sentLen) / float(all_num)))
        logger.info('all_num is %s;cutnum is %s,percentage is %.2f'
                    % (all_num, cut_num, cut_num / float(all_num)))
    return cutword, brands
def LtpRecon(self, sents): """ 分词,词性,句法,命名实体识别,语义识别 :param sents: :return: """ #分词 words = [i.encode('utf-8', 'ignore') for i in norm_cut(sents)] logger.info('\t'.join(words)) #词性 postags = self.postagger.postag(words) logger.info('\t'.join(postags)) #句法 arcs = self.parser.parse(words, postags) logger.info("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)) #实体识别 netags = self.recognizer.recognize(words, postags) logger.info('\t'.join(netags)) #语义标注 roles = self.labeller.label(words, postags, arcs) for role in roles: print role.index, "".join([ "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments ]) self.words, self.postags, self.arcs, self.netags, self.roles = \ words, postags, arcs, netags, roles
def extract_keyword(self, question):
    """
    Extract keyword categories from *question*: product, component,
    attribute, evaluation and service, plus related words.

    :param question: raw question text
    :return: dict mapping each category name ('product', 'component',
        'attribute', 'evaluation', 'service', 'relaword') to a list of
        matched words, plus 'new_question' — the question with synonyms
        replaced.
    """
    dic = {'relaword': set()}
    # Segment the question and replace each token with its canonical
    # synonym before matching against the dictionaries.
    cut_sent = [
        self.replace_word(i.encode('utf-8', 'ignore'))
        for i in norm_cut(question)
    ]
    key_list = [
        self.product_dict, self.component_dict, self.attribute_dict,
        self.evaluation_dict, self.service_dict
    ]
    keynames = [
        'product', 'component', 'attribute', 'evaluation', 'service'
    ]
    # NOTE(review): leftover debug prints removed — in particular
    # `print self.rela_dict['发动机']` raised KeyError when that key
    # was missing from the relation dictionary.
    for num, sub in enumerate(key_list):
        extract_word = set(cut_sent) & set(sub.values())
        dic[keynames[num]] = extract_word
        # Look up related words for each extracted keyword.
        for word in extract_word:
            if word in self.rela_dict:
                for relaword in set(self.rela_dict[word]) & set(cut_sent):
                    if relaword and relaword not in self.synonym.values():
                        dic['relaword'].add(relaword)
    # Normalize: every category is returned as a (possibly empty) list.
    keynames.append('relaword')
    for name in keynames:
        dic[name] = list(dic[name]) if dic[name] else []
    # Question text after synonym replacement.
    dic['new_question'] = ''.join(cut_sent)
    return dic
def avg_feature_vector(self, sentence, num_features=300, is_seg=True):
    """
    Average the word vectors of a sentence into a single sentence vector.

    :param sentence: text to embed; segmented with norm_cut when is_seg
        is True, otherwise split on whitespace
    :param num_features: dimensionality of the word vectors
    :param is_seg: whether the sentence still needs word segmentation
    :return: JSON-encoded list of num_features floats (all zeros when
        no word has a vector)
    """
    if is_seg:
        words = list(norm_cut(sentence))
    else:
        words = sentence.split()
    feature_vec = np.zeros((num_features,), dtype='float32')
    n_words = 0
    for word in words:
        # self.model.run may return None for out-of-vocabulary words.
        content = self.model.run(word)
        if content is not None:
            n_words += 1
            feature_vec = np.add(feature_vec, np.array(content))
    if n_words > 0:
        feature_vec = np.divide(feature_vec, n_words)
    # BUGFIX: the original built sent_vec only inside `if n_words > 0`,
    # so an all-OOV sentence raised UnboundLocalError on return. Always
    # serialize feature_vec (zeros when nothing matched). Also removed a
    # leftover debug print of the token list.
    sent_vec = np.ndarray.tolist(feature_vec)
    return json.dumps(sent_vec)
def cut_word(self, line):
    """Segment *line* and return the tokens joined by spaces, with stop
    words (matched on their UTF-8 encoding) filtered out."""
    kept = []
    for token in norm_cut(line):
        if token.encode('utf-8', 'ignore') not in self.stopword:
            kept.append(token)
    return ' '.join(kept)
def cut_word(self, sent): line = ' '.join([i for i in norm_cut(sent)]) print line
def FreqSug(self, word):
    """Suggest a user-dictionary frequency line for *word*.

    Returns '<word> <freq>\\n' when the word splits into more than one
    segment under non-HMM cutting (i.e. needs a frequency hint), else
    False.
    """
    segments = list(norm_cut(word, HMM=False))
    if len(segments) <= 1:
        return False
    freq = suggest_freq(word)
    return '%s %s\n' % (word, freq)
def cut_word(self, sent):
    """Segment *sent* and return the tokens as a list."""
    return list(norm_cut(sent))