Example #1
def result_format():
    HanLP.Config.ShowTermNature = False
    seg = HanLP.newSegment()
    print(seg.seg(sentences[0]))
    HanLP.Config.ShowTermNature = True
    seg = HanLP.newSegment()
    term_list = seg.seg(sentences[0])
    print(term_list)
    print([str(i.word) for i in term_list])
    print([str(i.nature) for i in term_list])
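For context: result_format relies on a module-level sentences list and the pyhanlp import, neither of which is shown in this excerpt. A minimal, hedged setup sketch (the sample sentence is a placeholder, not data from the original project):

from pyhanlp import HanLP

sentences = ['攻城狮逆袭单身狗,迎娶白富美,走上人生巅峰']  # placeholder test data
result_format()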
Example #2
    def pos_filter(self, s):
        if not s:
            return []
        terms = HanLP.segment(s)  # segment once and reuse the result
        wds = [w.word for w in terms]
        pos = [str(w.nature) for w in terms if w.nature]

        if len(''.join(wds)) < 2:
            return []
        if 'n' not in pos and 'nhd' not in pos:
            return []
        return ''.join(wds)
Example #3
def extractSummary(document, size, sentence_separator=None):
    """
     * Automatic summarization (key-sentence extraction)
     *
     * @param document           the target document
     * @param size               the number of key sentences to extract
     * @param sentence_separator sentence delimiter used to split the document, as a regex, e.g. [。??!!;;]
     * @return a list of key sentences
    """
    if sentence_separator:
        return HanLP.extractSummary(document, size, sentence_separator)
    else:
        return HanLP.extractSummary(document, size)
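A hedged usage sketch for the wrapper above; the news snippet is placeholder text, not data from the original project.

from pyhanlp import HanLP

doc = '水利部水资源司司长陈明忠9月29日在国务院新闻办举行的新闻发布会上透露,根据刚刚完成了水资源管理制度的考核,有部分省接近了红线的指标,有部分省超过红线的指标。'
print(extractSummary(doc, 3))                    # default sentence splitting
print(extractSummary(doc, 3, '[。??!!;;]'))   # explicit regex separator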
Example #4
def make_index():
    with open(ITEM_INDEX_JSON, 'w', encoding='utf8') as item_index_file, \
            open(ITEM_SOURCE_JSON, 'r', encoding='utf8') as item_file:

        item_js = json.load(item_file)
        all_info = item_js['RECORDS']
        for item in all_info:
            title = item['TITLE']
            ITEM_DICT[item['ENTERPRISE_ID']]['org_id'] = item['ORG_ID']
            if 'items' not in ITEM_DICT[item['ENTERPRISE_ID']]:
                ITEM_DICT[item['ENTERPRISE_ID']]['items'] = set()
            # TODO: segment and filter here.
            segs = HanLP.segment(title)
            for word in segs:
                _word = word.word
                nature = str(word.nature)
                if nature in ['vn', 'vi']:
                    ITEM_DICT[item['ENTERPRISE_ID']]['items'].add(_word)
                elif nature == 'v' and _word in V_SET:
                    ITEM_DICT[item['ENTERPRISE_ID']]['items'].add(_word)
                elif nature in [
                        'n', 'ng', 'nh', 'nhd', 'nl', 'nm', 'nz', 'nba'
                ] and _word not in FIL_SET:
                    ITEM_DICT[item['ENTERPRISE_ID']]['items'].add(_word)

        for key in ITEM_DICT.keys():
            ITEM_DICT[key]['items'] = list(ITEM_DICT[key]['items'])
        js_info = json.dumps(ITEM_DICT)
        item_index_file.write(js_info)

    with open(TYPE_INDEX_JSON, 'w', encoding='utf8') as type_index_file, \
            open(TYPE_SOURCE_JSON, 'r', encoding='utf8') as type_file:

        type_js = json.load(type_file)
        all_info = type_js['RECORDS']
        for item in filter(lambda x: len(x['CODE']) == 9, all_info):
            TYPE_DICT[item['CODE']] = set()
            if item['SERVICETYPEVALUE']:
                value_words = HanLP.segment(item['SERVICETYPEVALUE'])
                for word in value_words:
                    TYPE_DICT[item['CODE']].add(word.word)
            if item['KEYWORD']:
                key_words = HanLP.segment(item['KEYWORD'])
                for word in key_words:
                    TYPE_DICT[item['CODE']].add(word.word)
        # convert set to list
        for k in TYPE_DICT.keys():
            TYPE_DICT[k] = list(TYPE_DICT[k])

        js_info = json.dumps(TYPE_DICT)
        type_index_file.write(js_info)
Example #5
def getSummary(document, max_length, sentence_separator=None):
    """
     * Automatic summarization
     *
     * @param document           the target document
     * @param max_length         the desired length of the summary
     * @param sentence_separator sentence delimiter used to split the document, as a regex, e.g. [。??!!;;]
     * @return the summary text

    """
    if sentence_separator:
        return HanLP.getSummary(document, max_length, sentence_separator)
    else:
        return HanLP.getSummary(document, max_length)
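Unlike extractSummary, which returns a list of key sentences, getSummary joins the key sentences into a single string capped at max_length. A minimal, hedged usage sketch reusing the placeholder doc from the extractSummary example above:

print(getSummary(doc, 50))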
Example #6
def get_hanlp_entity_weight_dict(prep_article,
                                 entity_type,
                                 sentence_type='original',
                                 sentence_count=4):
    """
    Extract subject/object/predicate terms from each article's title and central sentences, and compute a weight for every term of the requested type.
    :param prep_article: an instance of the PreprocessArticle class
    :param entity_type: which terms to extract; one of sub/obj/predicate, i.e. subject, object or predicate
    :param sentence_type: how candidate sentences are ordered; original (document order) or score (sentence-score order)
    :param sentence_count: number of central sentences to use, 4 by default; if 0, only the title is processed
    :return: a word-to-weight dict for the requested subject/object/predicate type
    """
    entitis = []
    # article title
    words = HanLP.parseDependency(prep_article.title).word
    if entity_type == 'sub':
        entitis.append(get_hanlp_sub_entity(words))
    if entity_type == 'obj':
        entitis.append(get_hanlp_obj_entity(words))
    if entity_type == 'predicate':
        entitis.append(get_hanlp_predicate_entity(words))
    # article sentences
    if sentence_count > 0:
        # first n sentences in original document order
        if sentence_type == 'original':
            for i, sentence in enumerate(prep_article.sentences):
                if i < sentence_count:
                    words = HanLP.parseDependency(sentence.text).word
                    if entity_type == 'sub':
                        entitis.append(get_hanlp_sub_entity(words))
                    if entity_type == 'obj':
                        entitis.append(get_hanlp_obj_entity(words))
                    if entity_type == 'predicate':
                        entitis.append(get_hanlp_predicate_entity(words))
        # top n sentences by descending score
        if sentence_type == 'score':
            for i, idx in enumerate(prep_article.descend_sentence_index):
                if i < sentence_count:
                    words = HanLP.parseDependency(
                        prep_article.sentences[idx].text).word
                    if entity_type == 'sub':
                        entitis.append(get_hanlp_sub_entity(words))
                    if entity_type == 'obj':
                        entitis.append(get_hanlp_obj_entity(words))
                    if entity_type == 'predicate':
                        entitis.append(get_hanlp_predicate_entity(words))

    entity_weight_dict = calculate_weight(entitis)
    return entity_weight_dict
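The helpers get_hanlp_sub_entity / get_hanlp_obj_entity / get_hanlp_predicate_entity are not shown in this excerpt. A hedged sketch of what the subject extractor might look like, using the Chinese dependency labels that pyhanlp's parseDependency emits (the helper body and the sample sentence are assumptions, not the original code):

from pyhanlp import HanLP

def get_hanlp_sub_entity(words):
    # `words` is the .word array of a CoNLLSentence; keep lemmas attached to their head by a subject-verb arc.
    return [w.LEMMA for w in words if str(w.DEPREL) == '主谓关系']

words = HanLP.parseDependency('徐先生还具体帮助他确定了把画雄鹰、松鼠和麻雀作为主攻目标。').word
print(get_hanlp_sub_entity(words))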
Example #7
def dependency_analysis(sent):
    result = HanLP.parseDependency(sent)
    ROOT, SUBJECT, PREDICATE = '核心关系', '主谓关系', '宾'
    res = dict()
    key = ['root', 'sub', 'pre', 'sub_adj', 'pre_adj', 'entity']
    for word in result.iterator():
        type = str(word.DEPREL)
        if type.find(ROOT) >= 0:
            res['root'] = word.LEMMA
        elif type.find(SUBJECT) >= 0:
            res['sub'] = word.LEMMA
        elif type.find(PREDICATE) >= 0:
            res['pre'] = word.LEMMA
    res['entity'] = []
    for word in result.iterator():
        if str(word.CPOSTAG).find('n') >= 0 and str(
                word.CPOSTAG).find('v') < 0:
            res['entity'].append(word.LEMMA)
        if res.get('sub') and str(word.HEAD.LEMMA) == str(res['sub']):
            res['sub_adj'] = res['sub_adj'] + [word.LEMMA] if res.get(
                'sub_adj') else [word.LEMMA]
        else:
            res['pre_adj'] = res['pre_adj'] + [word.LEMMA] if res.get(
                'pre_adj') else [word.LEMMA]
    for k in key:
        res[k] = res.get(k, '空')
        if isinstance(res[k], list):
            res[k] = '|'.join(res[k])
    print(res)
    return res
Example #8
    def split1list(self, sentence):
        # Strip leading/trailing whitespace; decode byte input so we always work with unicode text.
        if isinstance(sentence, bytes):
            sentence = sentence.decode('utf-8', 'ignore')
        line = sentence.strip()
        # Remove digits, whitespace and both ASCII and full-width punctuation.
        line1 = re.sub(
            "[0-9\s+\.\!\/_,$%^*()?;;:-【】+\"\']+|[+——!,;:。?、~@#¥%……&*()]+",
            " ", line)
        #wordList = list(jieba.cut(line1))  # alternative: segment each line with jieba
        wordList = HanLP.segment(line1.strip())
        poslist = set()
        for w in wordList:
            length = len(w.word)
            nature = str(w.nature)
            # Skip single-character punctuation-like tokens (nature codes containing 'w').
            if length < 2 and 'w' in nature:
                continue

            if w.word in self.stopwords:
                continue

            #if self.isFormWord(nature):
            #    continue

            #wordpos = w.word + '   ' + nature
            #self.wordposlist.append(wordpos)

            poslist.add(w.word)

        return poslist
Example #9
    def pinyin(self, sentence):
        pinyinlist = HanLP.convertToPinyinList(sentence)
        res = []
        for pinyin in pinyinlist:
            res.append(str(pinyin))

        return ''.join(res)
Example #10
def get_keywords(query, par_dict, sim_dic):
    _words = HanLP.segment(query)
    temp = []
    added = []
    keywords = []
    visited = set()

    for word in _words:
        _word = word.word
        nature = str(word.nature)
        if _word in SAVED:
            temp.append(_word)
        elif nature in ['vn', 'vi']:
            temp.append(_word)
        elif nature == 'v' and _word in V_SET:
            temp.append(_word)
        elif nature in ['n', 'ng', 'nh', 'nhd', 'nl', 'nm', 'nz', 'nba'
                        ] and _word not in FIL_SET and len(_word) > 1:
            temp.append(_word)
    for item in temp:
        added.append((item, 1.5))
        if item in par_dict:
            added.append((par_dict[item], 1))
        if item in sim_dic[0]:
            for sim in sim_dic[1][sim_dic[0][item]]:
                added.append((sim, 1))
    for item in added:
        if item[0] not in visited:
            keywords.append(item)
            visited.add(item[0])  # track the keyword itself so duplicates with different weights are dropped
    return keywords
Example #11
def get_abstract_sentence(sentence, vocabulary):
    '''
    Sentence abstraction: replace recognized entities with placeholder tags.
    movie title                                   nm
    actor name                                    nnt
    movie genre                                   ng
    actor name immediately following another one  nnr
    rating                                        x
    '''
    abstract_sentence = []
    query_dict = {}
    second = False
    for segment in HanLP.segment(sentence):
        word = str(segment.word)
        nature = str(segment.nature)
        if nature == "nm":
            query_dict["nm"] = word
            word = "nm"
        elif nature == "nnt" and not second:
            query_dict["nnt"] = word
            word = "nnt"
            second = True
        elif nature == "ng":
            query_dict["ng"] = word
            word = "ng"
        elif nature == "m":
            query_dict["x"] = word
            word = "x"
        elif nature == "nnt" and second:
            query_dict["nnr"] = word
            word = "nnr"
            second = False
        if word in vocabulary:
            abstract_sentence.append(word)
    return abstract_sentence, query_dict
Example #12
 def __iter__(self):
     """make each sentence a new line"""
     normed_sent = preprocess(self.strings)
     for sent in split_iter(normed_sent, self.eos_placement):
         sent = ''.join(sent)
         if sent:
             yield list(term.word for term in HanLP.segment(sent))
Example #13
def ws(filename, convert2zh=False):

    if not os.path.exists(REPO_DIR):
        os.makedirs(REPO_DIR)

    file = os.path.join(REPO_DIR, filename)
    fw = codecs.open(file + '.seg.sc', 'w', encoding = 'utf-8')
    
    regex = re.compile(r'[\u4e00-\u9fffa-zA-Z0-9]+')

    with codecs.open(file, 'r', encoding = 'utf-8') as fr:
        for line in fr:
            line = line.split('\t', 1)[1].strip().replace('“', '').replace('”', '')
            line = clean(line)
            _list = regex.findall(line.strip())
            seq = ''
            for span in _list:
                result = analyzer.analyze(span)
                for terms in result.toSimpleWordList():
                    field = terms.toString().split('/')
                    word = field[0] if not convert2zh else HanLP.convertToTraditionalChinese(field[0])
                    pos = field[1]
                    seq += word.lower() + '_' + pos + ' '

                seq += ',_, '
                    
            fw.write(seq.rsplit('_', 1)[0][:-1] + '。_。\n')

    fw.close()
Example #14
 def dependency_parse(self, sent, standard_name=False, stopwords=None):
     """
     Dependency parsing: calls the pyhanlp interface and folds in harvesttext's
     entity recognition mechanism. High accuracy is not guaranteed.
     :param sent:
     :param standard_name:
     :param stopwords:
     :return: arcs: dependency arcs, as a list of lists:
     [[word id, word surface form or entity name (controlled by standard_name), POS tag, dependency relation, head word id] for each word]
     """
     from pyhanlp import HanLP, JClass
     if not self.hanlp_prepared:
         self.hanlp_prepare()
     self.standard_name = standard_name
     entities_info = self.entity_linking(sent)
     sent2 = self.decoref(sent, entities_info)
     # [word.ID-1, word.LEMMA, word.POSTAG, word.DEPREL ,word.HEAD.ID-1]
     arcs = []
     i = 0
     sentence = HanLP.parseDependency(sent2)
     for word in sentence.iterator():
         word0, tag0 = word.LEMMA, word.POSTAG
         if stopwords and word0 in stopwords:
             continue
         if word0 in self.entity_types:
             if self.standard_name:
                 word0 = entities_info[i][1][0]  # use the linked entity name
             else:
                 l, r = entities_info[i][0]  # or use the original surface text
                 word0 = sent[l:r]
             tag0 = entities_info[i][1][1][1:-1]
             i += 1
         arcs.append([word.ID-1, word0, tag0, word.DEPREL, word.HEAD.ID-1])
     return arcs
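This method comes from harvesttext's HarvestText class. A hedged usage sketch, assuming the package is installed; the entity registration below is illustrative, and the add_entities parameter names are recalled from the harvesttext API rather than taken from this source:

from harvesttext import HarvestText

ht = HarvestText()
ht.add_entities(entity_mention_dict={'武磊': ['武磊', '武球王']},
                entity_type_dict={'武磊': '球员'})   # register an entity so entity_linking can resolve it
print(ht.dependency_parse('武球王攻入了本场比赛的第一粒进球'))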
Example #15
def input_pipeline(sentence, lang, bpe=None):
    """
    1. word segmentation (zh)
    2. lowercasing (en)
    3. tokenize
    4. bpe
    """
    if lang == 'zh':
        seg = [term.word for term in HanLP.segment(sentence)]
        seg_str = ' '.join(seg)
        #print('after segmentation:', seg)
        mt = MosesTokenizer(lang='zh')
        tokenized_str = mt.tokenize(seg_str, return_str=True)
        #print('after tokenization:', tokenized_str)
        if bpe is not None:
            bpe_str = bpe.apply([tokenized_str])[0]
            #print('after BPE:', bpe_str)
            return bpe_str.split()
        return tokenized_str.split()
    elif lang == 'en':
        lower = sentence.lower()
        #print('after lowercasing:', lower)
        mt = MosesTokenizer(lang='en')
        tokenized_str = mt.tokenize(lower, return_str=True)
        #print('after tokenization:', tokenized_str)
        if bpe is not None:
            bpe_str = bpe.apply([tokenized_str])[0]
            #print('after BPE:', bpe_str)
            return bpe_str.split()
        return tokenized_str.split()
    else:
        raise ValueError("lang must be 'zh' or 'en'")
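A hedged usage sketch for input_pipeline. It assumes sacremoses provides MosesTokenizer (as used inside the function) and that bpe is a fastBPE-style object exposing .apply(list_of_str); codes.zh is a hypothetical path to BPE merge codes trained elsewhere.

from pyhanlp import HanLP
import fastBPE

bpe_zh = fastBPE.fastBPE('codes.zh')  # hypothetical BPE codes file
print(input_pipeline('今天天气不错', 'zh', bpe=bpe_zh))
print(input_pipeline('The weather is nice today.', 'en'))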
Example #16
def show_words():
    sql = 'SELECT * FROM NEWSWB'
    lock.acquire()
    cursor.execute(sql)
    lock.release()
    news = cursor.fetchone()
    print(news[5], '>>>>>>>', HanLP.extractKeyword(news[5], 5))
Example #17
    def get_sentence_mapping(self, overload=False):
        """
        Sentence-to-vector mapping table
        :return:    vec_space, e.g. {'我们是中国人,我们爱自己的祖国': [......], '蜀道难,难于上青天': [......]}
        """
        sentence_to_vec_file = current_path + '/sentence_mapping.pkl'
        if not os.path.isfile(sentence_to_vec_file) or overload:
            print('首次加载句子时间较长,请稍等......')
            sentence_to_vec = {}
            for sentence in self.sentence_list:
                tmp = np.zeros(shape=self.dim)
                index = 0
                for obj in HanLP.segment(sentence):
                    word = obj.word
                    if word in self.char_mapping:
                        tmp += self.char_mapping[word]
                    index += 1
                if index:
                    tmp /= index  # average the word vectors; guard against empty segmentations
                sentence_to_vec[sentence] = tmp

            # Persist the mapping once, after all sentences have been processed.
            with open(sentence_to_vec_file, 'wb') as f:
                pickle.dump(sentence_to_vec, f)

        else:
            with open(sentence_to_vec_file, 'rb') as f:
                sentence_to_vec = pickle.load(f)

        return sentence_to_vec
Example #18
def add_to_dictionary(word, part, mod=0):
    result = CustomDictionary.add(word, part)
    if not result and mod:
        CustomDictionary.insert(word, part)
    text = "我用天猫交社保"
    print(HanLP.segment(text))
    return result
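A hedged usage sketch. The snippet above relies on a module-level CustomDictionary; one way to obtain it (an assumption about the original imports) is via pyhanlp's JClass. The nature strings follow HanLP's 'tag frequency' custom-dictionary format, and the words are illustrative only.

from pyhanlp import HanLP, JClass

CustomDictionary = JClass('com.hankcs.hanlp.dictionary.CustomDictionary')
add_to_dictionary('天猫', 'nz 1024')        # add only if the word is not already present
add_to_dictionary('交社保', 'v 1', mod=1)   # force-insert even if a conflicting entry exists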
Example #19
def raw_seg():
    """
    newSegment() supports the modes below; viterbi is the default.
    viterbi:    best balance of speed and accuracy; shortest-path segmentation, solved in HanLP with the Viterbi algorithm
    dat:        double-array trie dictionary segmentation, tens of millions of characters per second (POS tags may be unavailable, depending on your dictionary)
    crf:        high accuracy for segmentation, POS tagging and NER; suited to demanding NLP tasks
    perceptron: segmentation, POS tagging and NER, with support for online learning
    nshort:     N-shortest-path; slightly better NER at the cost of speed
    """
    seg = HanLP.newSegment()
    for st in sentences:
        print(seg.seg(st))

    seg_crf = HanLP.newSegment("crf")
    for st in sentences:
        print(seg_crf.seg(st))
    """
Example #20
def get_keyword(content, keynum=2):
    """
    Extract the keywords of a question; the number of keywords is controlled by keynum.
    :param content: a sentence
    :return: a list of keywords
    """
    keywordList = HanLP.extractKeyword(content, keynum)
    return keywordList
Example #21
 def segment(self, text):
     word_tag_list = HanLP.segment(text)
     word_list = []
     for word_tag in word_tag_list:
         # str(term) looks like "word/nature"; split on the last '/' so words containing '/' still unpack cleanly
         word, tag = str(word_tag).rsplit('/', 1)
         if tag == 'n':
             word_list.append(word)
     return word_list
Example #22
def convertToSimplifiedChinese(traditionalChineseString):
    """
     * Traditional-to-Simplified Chinese conversion
     *
     * @param traditionalChineseString Traditional Chinese text
     * @return Simplified Chinese text
    """
    return HanLP.convertToSimplifiedChinese(traditionalChineseString)
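A minimal usage sketch for the wrapper above; the sample string is a placeholder.

from pyhanlp import HanLP

print(convertToSimplifiedChinese('憂鬱的臺灣烏龜'))  # expected: 忧郁的台湾乌龟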
Example #23
 def test_custom_dict_forcing(self):
     segment = HanLP.newSegment('viterbi')
     CustomDictionary.insert('川普', 'nr 1')
     self.assertIn('四川/ns, 普通人/n, 与/cc, 川/b, 普通/a, 电话/n',
                   segment.seg('四川普通人与川普通电话').__str__())
     segment.enableCustomDictionaryForcing(True)
     self.assertIn('四川/ns, 普通人/n, 与/cc, 川普/nr, 通电话/vi',
                   segment.seg('四川普通人与川普通电话').__str__())
Example #24
 def load_data(self, file):
     result = []
     with open(file, mode='r', encoding="utf-8") as fp:
         lines = fp.readlines()
         for line in lines:
             words = HanLP.segment(str(line).strip())
             result.append(" ".join([str(i.word) for i in words]))
     return result
Example #25
def hanlp_recognize(text):

    # segment = HanLP.newSegment().enableNameRecognize(True)
    # segment = HanLP.newSegment().enableTranslatedNameRecognize(True)
    # segment = HanLP.newSegment().enablePlaceRecognize(True)
    segment = HanLP.newSegment().enableOrganizationRecognize(True)
    term_list = segment.seg(text)
    print(term_list)
Example #26
def segment(text):
    '''
    Segment a Chinese sentence with HanLP; fall back to whitespace splitting on failure.
    (`hanlp` below is presumably the name the original module imported HanLP under.)
    '''
    try:
        seg_result = hanlp.segment(text)
        return [term.word for term in seg_result]
    except Exception:
        return text.split()
Example #27
def parseDependency(sentence):
    """
     * Dependency grammar parsing
     *
     * @param sentence the sentence to analyze
     * @return the dependency tree in CoNLL format

    """
    return HanLP.parseDependency(sentence)
Example #28
def convertToPinyinList(text):
    """
     * Convert text to pinyin
     *
     * @param text the text to analyze
     * @return a list of pinyin objects

    """
    return HanLP.convertToPinyinList(text)
Example #29
def convertToTraditionalChinese(simplifiedChineseString):
    """
     * Simplified-to-Traditional Chinese conversion
     *
     * @param simplifiedChineseString Simplified Chinese text
     * @return Traditional Chinese text

    """
    return HanLP.convertToTraditionalChinese(simplifiedChineseString)
Example #30
def extractKeyword(document, size):
    """
     * Extract keywords
     *
     * @param document the document content
     * @param size     how many keywords to extract
     * @return a list of keywords
    """
    return HanLP.extractKeyword(document, size)
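A minimal usage sketch for the wrapper above; the sample text is a placeholder.

from pyhanlp import HanLP

text = '程序员是从事程序开发、程序维护的专业人员。程序员一般分为程序设计人员和程序编码人员。'
print(extractKeyword(text, 3))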