Exemplo n.º 1
0
def cut_word(data, filterWord):
    """
    Segment records into space-joined words and collect their brand labels.

    Each record's ``title`` and ``text`` are passed through ``trim`` and
    joined with a '.'. Records matched by ``adver_esm.query`` are skipped, and
    only texts whose decoded length is strictly between 5 and 4000 characters
    are segmented. Note that brands are gathered before the advertisement
    check, so brands of skipped records are still returned.

    :param data: iterable of records indexable by 'title' and 'text', with an
        optional 'brands' entry holding an iterable of brand strings
    :param filterWord: container of words to drop from the segmented output
        (stop words and general words such as '的', '有')
    :return: ``(cutword, brands)``; ``cutword`` is a list of utf-8 encoded
        strings whose words are separated by single spaces, e.g.
        ['wo xi huan dazhong che ', ...]; ``brands`` is a list of utf-8
        encoded brand names, e.g. ['dazhong', 'benchi']
    """
    cutword = []
    brands = []
    sentLen = []
    all_num = 0
    cut_num = 0
    jieba.enable_parallel(30)
    try:
        for lineSource in data:
            all_num += 1
            outline = trim(lineSource['title']) + '.' + trim(lineSource['text'])
            if 'brands' in lineSource:
                for brand in lineSource['brands']:
                    brands.append(brand.encode('utf-8', 'ignore'))
            sentlenth = len(outline.decode('utf-8', 'ignore'))
            sentLen.append(sentlenth)
            if adver_esm.query(outline):
                continue
            if 5 < sentlenth < 4000:
                cutword.append(' '.join([
                    i for i in norm_cut(outline) if i not in filterWord
                ]).encode('utf-8'))
                cut_num += 1
    finally:
        # Always leave parallel mode, even when a record fails mid-loop.
        jieba.disable_parallel()

    # float() forces true division (plain '/' on ints truncates in Python 2),
    # and the guard avoids ZeroDivisionError when ``data`` is empty.
    if all_num:
        avg_len = float(sum(sentLen)) / all_num
        cut_ratio = float(cut_num) / all_num
    else:
        avg_len = 0.0
        cut_ratio = 0.0
    logger.info('avg sent length %.2f' % avg_len)
    logger.info('all_num is %s;cutnum is %s,percentage is %.2f' %
                (all_num, cut_num, cut_ratio))
    return cutword, brands
Exemplo n.º 2
0
    def LtpRecon(self, sents):
        """
        分词,词性,句法,命名实体识别,语义识别
        :param sents:
        :return:
        """
        #分词
        words = [i.encode('utf-8', 'ignore') for i in norm_cut(sents)]
        logger.info('\t'.join(words))
        #词性
        postags = self.postagger.postag(words)
        logger.info('\t'.join(postags))
        #句法
        arcs = self.parser.parse(words, postags)
        logger.info("\t".join("%d:%s" % (arc.head, arc.relation)
                              for arc in arcs))
        #实体识别
        netags = self.recognizer.recognize(words, postags)
        logger.info('\t'.join(netags))
        #语义标注
        roles = self.labeller.label(words, postags, arcs)
        for role in roles:
            print role.index, "".join([
                "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                for arg in role.arguments
            ])

        self.words, self.postags, self.arcs, self.netags, self.roles = \
            words, postags, arcs, netags, roles
Exemplo n.º 3
0
    def extract_keyword(self, question):
        '''
        Extract keyword groups from ``question``.

        The question is segmented with ``norm_cut`` and every token is mapped
        through ``self.replace_word`` (synonym replacement). The resulting
        tokens are intersected with the values of the product, component,
        attribute, evaluation and service dictionaries. For each matched word
        found in ``self.rela_dict``, its related words that also occur in the
        question and are not among ``self.synonym`` values are collected.

        :param question: text to analyse
        :return: dict with list values under 'product', 'component',
            'attribute', 'evaluation', 'service' and 'relaword', plus
            'new_question' holding the replaced tokens joined back together
        '''
        # Replace synonyms token by token.
        cut_sent = [
            self.replace_word(i.encode('utf-8', 'ignore'))
            for i in norm_cut(question)
        ]
        # Built once: both are loop-invariant.
        sent_words = set(cut_sent)
        synonym_values = self.synonym.values()

        categories = [
            ('product', self.product_dict),
            ('component', self.component_dict),
            ('attribute', self.attribute_dict),
            ('evaluation', self.evaluation_dict),
            ('service', self.service_dict),
        ]
        dic = {'relaword': set()}
        for name, sub in categories:
            extract_word = sent_words & set(sub.values())
            dic[name] = extract_word
            # Look for related words of every matched keyword.
            for word in extract_word:
                if word in self.rela_dict:
                    for relaword in set(self.rela_dict[word]) & sent_words:
                        if relaword and relaword not in synonym_values:
                            dic['relaword'].add(relaword)

        # Normalise: every group is returned as a list.
        for name, _ in categories:
            dic[name] = list(dic[name])
        dic['relaword'] = list(dic['relaword'])

        # Question text after synonym replacement.
        dic['new_question'] = ''.join(cut_sent)
        return dic
Exemplo n.º 4
0
    def avg_feature_vector(self, sentence, num_features=300, is_seg=True):
        if is_seg:
            words = list(norm_cut(sentence))
        else:
            words = sentence.split()
            print ' '.join(words)

        feature_vec = np.zeros((num_features,), dtype='float32')
        n_words = 0
        for word in words:
            content = self.model.run(word)
            if content is not None:
                n_words += 1
                vec = np.array(content)
                # print 'type_vec',type(vec)
                feature_vec = np.add(feature_vec, vec)

        if (n_words > 0):
            feature_vec = np.divide(feature_vec, n_words)
        sent_vec = np.ndarray.tolist(feature_vec)
        return json.dumps(sent_vec)
Exemplo n.º 5
0
 def cut_word(self,line):
     """
     Segment ``line`` with ``norm_cut`` and return the tokens joined by
     single spaces, dropping tokens whose utf-8 encoding is in
     ``self.stopword``.
     """
     kept = []
     for token in norm_cut(line):
         if token.encode('utf-8','ignore') in self.stopword:
             continue
         kept.append(token)
     return ' '.join(kept)
Exemplo n.º 6
0
 def cut_word(self, sent):
     line = ' '.join([i for i in norm_cut(sent)])
     print line
Exemplo n.º 7
0
 def FreqSug(self, word):
     """
     Build a '<word> <value>' line for ``word`` when ``norm_cut`` (with HMM
     disabled) splits it into more than one token.

     :param word: candidate word
     :return: the line, using the value returned by ``suggest_freq(word)``
         and terminated by a newline, or False when the word is not split
     """
     pieces = list(norm_cut(word, HMM=False))
     if len(pieces) <= 1:
         return False
     num = suggest_freq(word)
     return '%s %s\n' % (word, num)
Exemplo n.º 8
0
 def cut_word(self, sent):
     """Return the tokens produced by ``norm_cut(sent)`` as a list."""
     return list(norm_cut(sent))