Example #1
def result_format():
    HanLP.Config.ShowTermNature = False
    seg = HanLP.newSegment()
    print(seg.seg(sentences[0]))
    HanLP.Config.ShowTermNature = True
    seg = HanLP.newSegment()
    term_list = seg.seg(sentences[0])
    print(term_list)
    print([str(i.word) for i in term_list])
    print([str(i.nature) for i in term_list])
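For context: result_format relies on a module-level sentences list and the pyhanlp import, neither of which is shown in this excerpt. A minimal, hedged setup sketch (the sample sentence is a placeholder, not data from the original project):

from pyhanlp import HanLP

sentences = ['攻城狮逆袭单身狗,迎娶白富美,走上人生巅峰']  # placeholder test data
result_format()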
Example #2
    def pos_filter(self, s):
        if not s:
            return []
        terms = HanLP.segment(s)  # segment once and reuse the result
        wds = [w.word for w in terms]
        pos = [str(w.nature) for w in terms if w.nature]

        if len(''.join(wds)) < 2:
            return []
        if 'n' not in pos and 'nhd' not in pos:
            return []
        return ''.join(wds)
Example #3
def extractSummary(document, size, sentence_separator=None):
    """
     * Automatic summarization (key-sentence extraction)
     *
     * @param document           the target document
     * @param size               the number of key sentences to extract
     * @param sentence_separator sentence delimiter used to split the document, as a regex, e.g. [。??!!;;]
     * @return a list of key sentences
    """
    if sentence_separator:
        return HanLP.extractSummary(document, size, sentence_separator)
    else:
        return HanLP.extractSummary(document, size)
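A hedged usage sketch for the wrapper above; the news snippet is placeholder text, not data from the original project.

from pyhanlp import HanLP

doc = '水利部水资源司司长陈明忠9月29日在国务院新闻办举行的新闻发布会上透露,根据刚刚完成了水资源管理制度的考核,有部分省接近了红线的指标,有部分省超过红线的指标。'
print(extractSummary(doc, 3))                    # default sentence splitting
print(extractSummary(doc, 3, '[。??!!;;]'))   # explicit regex separator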
Example #4
def make_index():
    with open(ITEM_INDEX_JSON, 'w', encoding='utf8') as item_index_file, \
            open(ITEM_SOURCE_JSON, 'r', encoding='utf8') as item_file:

        item_js = json.load(item_file)
        all_info = item_js['RECORDS']
        for item in all_info:
            title = item['TITLE']
            ITEM_DICT[item['ENTERPRISE_ID']]['org_id'] = item['ORG_ID']
            if 'items' not in ITEM_DICT[item['ENTERPRISE_ID']]:
                ITEM_DICT[item['ENTERPRISE_ID']]['items'] = set()
            # TODO: segment and filter here.
            segs = HanLP.segment(title)
            for word in segs:
                _word = word.word
                nature = str(word.nature)
                if nature in ['vn', 'vi']:
                    ITEM_DICT[item['ENTERPRISE_ID']]['items'].add(_word)
                elif nature == 'v' and _word in V_SET:
                    ITEM_DICT[item['ENTERPRISE_ID']]['items'].add(_word)
                elif nature in [
                        'n', 'ng', 'nh', 'nhd', 'nl', 'nm', 'nz', 'nba'
                ] and _word not in FIL_SET:
                    ITEM_DICT[item['ENTERPRISE_ID']]['items'].add(_word)

        for key in ITEM_DICT.keys():
            ITEM_DICT[key]['items'] = list(ITEM_DICT[key]['items'])
        js_info = json.dumps(ITEM_DICT)
        item_index_file.write(js_info)

    with open(TYPE_INDEX_JSON, 'w', encoding='utf8') as type_index_file, \
            open(TYPE_SOURCE_JSON, 'r', encoding='utf8') as type_file:

        type_js = json.load(type_file)
        all_info = type_js['RECORDS']
        for item in filter(lambda x: len(x['CODE']) == 9, all_info):
            TYPE_DICT[item['CODE']] = set()
            if item['SERVICETYPEVALUE']:
                value_words = HanLP.segment(item['SERVICETYPEVALUE'])
                for word in value_words:
                    TYPE_DICT[item['CODE']].add(word.word)
            if item['KEYWORD']:
                key_words = HanLP.segment(item['KEYWORD'])
                for word in key_words:
                    TYPE_DICT[item['CODE']].add(word.word)
        # convert set to list
        for k in TYPE_DICT.keys():
            TYPE_DICT[k] = list(TYPE_DICT[k])

        js_info = json.dumps(TYPE_DICT)
        type_index_file.write(js_info)
Example #5
def getSummary(document, max_length, sentence_separator=None):
    """
     * Automatic summarization
     *
     * @param document           the target document
     * @param max_length         the desired length of the summary
     * @param sentence_separator sentence delimiter used to split the document, as a regex, e.g. [。??!!;;]
     * @return the summary text

    """
    if sentence_separator:
        return HanLP.getSummary(document, max_length, sentence_separator)
    else:
        return HanLP.getSummary(document, max_length)
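Unlike extractSummary, which returns a list of key sentences, getSummary joins the key sentences into a single string capped at max_length. A minimal, hedged usage sketch reusing the placeholder doc from the extractSummary example above:

print(getSummary(doc, 50))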
Example #6
def get_hanlp_entity_weight_dict(prep_article,
                                 entity_type,
                                 sentence_type='original',
                                 sentence_count=4):
    """
    Extract subject/object/predicate terms from each article's title and central sentences, and compute a weight for every term of the requested type.
    :param prep_article: an instance of the PreprocessArticle class
    :param entity_type: which terms to extract; one of sub/obj/predicate, i.e. subject, object or predicate
    :param sentence_type: how candidate sentences are ordered; original (document order) or score (sentence-score order)
    :param sentence_count: number of central sentences to use, 4 by default; if 0, only the title is processed
    :return: a word-to-weight dict for the requested subject/object/predicate type
    """
    entitis = []
    # article title
    words = HanLP.parseDependency(prep_article.title).word
    if entity_type == 'sub':
        entitis.append(get_hanlp_sub_entity(words))
    if entity_type == 'obj':
        entitis.append(get_hanlp_obj_entity(words))
    if entity_type == 'predicate':
        entitis.append(get_hanlp_predicate_entity(words))
    # article sentences
    if sentence_count > 0:
        # first n sentences in original document order
        if sentence_type == 'original':
            for i, sentence in enumerate(prep_article.sentences):
                if i < sentence_count:
                    words = HanLP.parseDependency(sentence.text).word
                    if entity_type == 'sub':
                        entitis.append(get_hanlp_sub_entity(words))
                    if entity_type == 'obj':
                        entitis.append(get_hanlp_obj_entity(words))
                    if entity_type == 'predicate':
                        entitis.append(get_hanlp_predicate_entity(words))
        # top n sentences by descending score
        if sentence_type == 'score':
            for i, idx in enumerate(prep_article.descend_sentence_index):
                if i < sentence_count:
                    words = HanLP.parseDependency(
                        prep_article.sentences[idx].text).word
                    if entity_type == 'sub':
                        entitis.append(get_hanlp_sub_entity(words))
                    if entity_type == 'obj':
                        entitis.append(get_hanlp_obj_entity(words))
                    if entity_type == 'predicate':
                        entitis.append(get_hanlp_predicate_entity(words))

    entity_weight_dict = calculate_weight(entitis)
    return entity_weight_dict
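The helpers get_hanlp_sub_entity / get_hanlp_obj_entity / get_hanlp_predicate_entity are not shown in this excerpt. A hedged sketch of what the subject extractor might look like, using the Chinese dependency labels that pyhanlp's parseDependency emits (the helper body and the sample sentence are assumptions, not the original code):

from pyhanlp import HanLP

def get_hanlp_sub_entity(words):
    # `words` is the .word array of a CoNLLSentence; keep lemmas attached to their head by a subject-verb arc.
    return [w.LEMMA for w in words if str(w.DEPREL) == '主谓关系']

words = HanLP.parseDependency('徐先生还具体帮助他确定了把画雄鹰、松鼠和麻雀作为主攻目标。').word
print(get_hanlp_sub_entity(words))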
Example #7
def dependency_analysis(sent):
    result = HanLP.parseDependency(sent)
    ROOT, SUBJECT, PREDICATE = '核心关系', '主谓关系', '宾'
    res = dict()
    key = ['root', 'sub', 'pre', 'sub_adj', 'pre_adj', 'entity']
    for word in result.iterator():
        type = str(word.DEPREL)
        if type.find(ROOT) >= 0:
            res['root'] = word.LEMMA
        elif type.find(SUBJECT) >= 0:
            res['sub'] = word.LEMMA
        elif type.find(PREDICATE) >= 0:
            res['pre'] = word.LEMMA
    res['entity'] = []
    for word in result.iterator():
        if str(word.CPOSTAG).find('n') >= 0 and str(
                word.CPOSTAG).find('v') < 0:
            res['entity'].append(word.LEMMA)
        if res.get('sub') and str(word.HEAD.LEMMA) == str(res['sub']):
            res['sub_adj'] = res['sub_adj'] + [word.LEMMA] if res.get(
                'sub_adj') else [word.LEMMA]
        else:
            res['pre_adj'] = res['pre_adj'] + [word.LEMMA] if res.get(
                'pre_adj') else [word.LEMMA]
    for k in key:
        res[k] = res.get(k, '空')
        if isinstance(res[k], list):
            res[k] = '|'.join(res[k])
    print(res)
    return res
Example #8
    def split1list(self, sentence):
        # Strip leading/trailing whitespace; decode byte input so we always work with unicode text.
        if isinstance(sentence, bytes):
            sentence = sentence.decode('utf-8', 'ignore')
        line = sentence.strip()
        # Remove digits, whitespace and both ASCII and full-width punctuation.
        line1 = re.sub(
            "[0-9\s+\.\!\/_,$%^*()?;;:-【】+\"\']+|[+——!,;:。?、~@#¥%……&*()]+",
            " ", line)
        #wordList = list(jieba.cut(line1))  # alternative: segment each line with jieba
        wordList = HanLP.segment(line1.strip())
        poslist = set()
        for w in wordList:
            length = len(w.word)
            nature = str(w.nature)
            # Skip single-character punctuation-like tokens (nature codes containing 'w').
            if length < 2 and 'w' in nature:
                continue

            if w.word in self.stopwords:
                continue

            #if self.isFormWord(nature):
            #    continue

            #wordpos = w.word + '   ' + nature
            #self.wordposlist.append(wordpos)

            poslist.add(w.word)

        return poslist
Example #9
    def pinyin(self, sentence):
        pinyinlist = HanLP.convertToPinyinList(sentence)
        res = []
        for pinyin in pinyinlist:
            res.append(str(pinyin))

        return ''.join(res)
Example #10
def get_keywords(query, par_dict, sim_dic):
    _words = HanLP.segment(query)
    temp = []
    added = []
    keywords = []
    visited = set()

    for word in _words:
        _word = word.word
        nature = str(word.nature)
        if _word in SAVED:
            temp.append(_word)
        elif nature in ['vn', 'vi']:
            temp.append(_word)
        elif nature == 'v' and _word in V_SET:
            temp.append(_word)
        elif nature in ['n', 'ng', 'nh', 'nhd', 'nl', 'nm', 'nz', 'nba'
                        ] and _word not in FIL_SET and len(_word) > 1:
            temp.append(_word)
    for item in temp:
        added.append((item, 1.5))
        if item in par_dict:
            added.append((par_dict[item], 1))
        if item in sim_dic[0]:
            for sim in sim_dic[1][sim_dic[0][item]]:
                added.append((sim, 1))
    for item in added:
        if item[0] not in visited:
            keywords.append(item)
            visited.add(item[0])  # track the keyword itself so duplicates with different weights are dropped
    return keywords
Example #11
def get_abstract_sentence(sentence, vocabulary):
    '''
    Sentence abstraction: replace recognized entities with placeholder tags.
    movie title                                   nm
    actor name                                    nnt
    movie genre                                   ng
    actor name immediately following another one  nnr
    rating                                        x
    '''
    abstract_sentence = []
    query_dict = {}
    second = False
    for segment in HanLP.segment(sentence):
        word = str(segment.word)
        nature = str(segment.nature)
        if nature == "nm":
            query_dict["nm"] = word
            word = "nm"
        elif nature == "nnt" and not second:
            query_dict["nnt"] = word
            word = "nnt"
            second = True
        elif nature == "ng":
            query_dict["ng"] = word
            word = "ng"
        elif nature == "m":
            query_dict["x"] = word
            word = "x"
        elif nature == "nnt" and second:
            query_dict["nnr"] = word
            word = "nnr"
            second = False
        if word in vocabulary:
            abstract_sentence.append(word)
    return abstract_sentence, query_dict
Example #12
 def __iter__(self):
     """make each sentence a new line"""
     normed_sent = preprocess(self.strings)
     for sent in split_iter(normed_sent, self.eos_placement):
         sent = ''.join(sent)
         if sent:
             yield list(term.word for term in HanLP.segment(sent))
Example #13
def ws(filename, convert2zh=False):

    if not os.path.exists(REPO_DIR):
        os.makedirs(REPO_DIR)

    file = os.path.join(REPO_DIR, filename)
    fw = codecs.open(file + '.seg.sc', 'w', encoding = 'utf-8')
    
    regex = re.compile(r'[\u4e00-\u9fffa-zA-Z0-9]+')

    with codecs.open(file, 'r', encoding = 'utf-8') as fr:
        for line in fr:
            line = line.split('\t', 1)[1].strip().replace('“', '').replace('”', '')
            line = clean(line)
            _list = regex.findall(line.strip())
            seq = ''
            for span in _list:
                result = analyzer.analyze(span)
                for terms in result.toSimpleWordList():
                    field = terms.toString().split('/')
                    word = field[0] if not convert2zh else HanLP.convertToTraditionalChinese(field[0])
                    pos = field[1]
                    seq += word.lower() + '_' + pos + ' '

                seq += ',_, '
                    
            fw.write(seq.rsplit('_', 1)[0][:-1] + '。_。\n')

    fw.close()
Example #14
 def dependency_parse(self, sent, standard_name=False, stopwords=None):
     """
     Dependency parsing: calls the pyhanlp interface and folds in harvesttext's
     entity recognition mechanism. High accuracy is not guaranteed.
     :param sent:
     :param standard_name:
     :param stopwords:
     :return: arcs: dependency arcs, as a list of lists:
     [[word id, word surface form or entity name (controlled by standard_name), POS tag, dependency relation, head word id] for each word]
     """
     from pyhanlp import HanLP, JClass
     if not self.hanlp_prepared:
         self.hanlp_prepare()
     self.standard_name = standard_name
     entities_info = self.entity_linking(sent)
     sent2 = self.decoref(sent, entities_info)
     # [word.ID-1, word.LEMMA, word.POSTAG, word.DEPREL ,word.HEAD.ID-1]
     arcs = []
     i = 0
     sentence = HanLP.parseDependency(sent2)
     for word in sentence.iterator():
         word0, tag0 = word.LEMMA, word.POSTAG
         if stopwords and word0 in stopwords:
             continue
         if word0 in self.entity_types:
             if self.standard_name:
                 word0 = entities_info[i][1][0]  # use the linked entity name
             else:
                 l, r = entities_info[i][0]  # or use the original surface text
                 word0 = sent[l:r]
             tag0 = entities_info[i][1][1][1:-1]
             i += 1
         arcs.append([word.ID-1, word0, tag0, word.DEPREL, word.HEAD.ID-1])
     return arcs
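This method comes from harvesttext's HarvestText class. A hedged usage sketch, assuming the package is installed; the entity registration below is illustrative, and the add_entities parameter names are recalled from the harvesttext API rather than taken from this source:

from harvesttext import HarvestText

ht = HarvestText()
ht.add_entities(entity_mention_dict={'武磊': ['武磊', '武球王']},
                entity_type_dict={'武磊': '球员'})   # register an entity so entity_linking can resolve it
print(ht.dependency_parse('武球王攻入了本场比赛的第一粒进球'))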
Example #15
def input_pipeline(sentence, lang, bpe=None):
    """
    1. word segmentation (zh)
    2. lowercasing (en)
    3. tokenize
    4. bpe
    """
    if lang == 'zh':
        seg = [term.word for term in HanLP.segment(sentence)]
        seg_str = ' '.join(seg)
        #print('after segmentation:', seg)
        mt = MosesTokenizer(lang='zh')
        tokenized_str = mt.tokenize(seg_str, return_str=True)
        #print('after tokenization:', tokenized_str)
        if bpe is not None:
            bpe_str = bpe.apply([tokenized_str])[0]
            #print('after BPE:', bpe_str)
            return bpe_str.split()
        return tokenized_str.split()
    elif lang == 'en':
        lower = sentence.lower()
        #print('after lowercasing:', lower)
        mt = MosesTokenizer(lang='en')
        tokenized_str = mt.tokenize(lower, return_str=True)
        #print('after tokenization:', tokenized_str)
        if bpe is not None:
            bpe_str = bpe.apply([tokenized_str])[0]
            #print('after BPE:', bpe_str)
            return bpe_str.split()
        return tokenized_str.split()
    else:
        raise ValueError("lang must be 'zh' or 'en'")
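A hedged usage sketch for input_pipeline. It assumes sacremoses provides MosesTokenizer (as used inside the function) and that bpe is a fastBPE-style object exposing .apply(list_of_str); codes.zh is a hypothetical path to BPE merge codes trained elsewhere.

from pyhanlp import HanLP
import fastBPE

bpe_zh = fastBPE.fastBPE('codes.zh')  # hypothetical BPE codes file
print(input_pipeline('今天天气不错', 'zh', bpe=bpe_zh))
print(input_pipeline('The weather is nice today.', 'en'))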
Example #16
def show_words():
    sql = 'SELECT * FROM NEWSWB'
    lock.acquire()
    cursor.execute(sql)
    lock.release()
    news = cursor.fetchone()
    print(news[5], '>>>>>>>', HanLP.extractKeyword(news[5], 5))
Example #17
    def get_sentence_mapping(self, overload=False):
        """
        Sentence-to-vector mapping table
        :return:    vec_space, e.g. {'我们是中国人,我们爱自己的祖国': [......], '蜀道难,难于上青天': [......]}
        """
        sentence_to_vec_file = current_path + '/sentence_mapping.pkl'
        if not os.path.isfile(sentence_to_vec_file) or overload:
            print('首次加载句子时间较长,请稍等......')
            sentence_to_vec = {}
            for sentence in self.sentence_list:
                tmp = np.zeros(shape=self.dim)
                index = 0
                for obj in HanLP.segment(sentence):
                    word = obj.word
                    if word in self.char_mapping:
                        tmp += self.char_mapping[word]
                    index += 1
                if index:
                    tmp /= index  # average the word vectors; guard against empty segmentations
                sentence_to_vec[sentence] = tmp

            # Persist the mapping once, after all sentences have been processed.
            with open(sentence_to_vec_file, 'wb') as f:
                pickle.dump(sentence_to_vec, f)

        else:
            with open(sentence_to_vec_file, 'rb') as f:
                sentence_to_vec = pickle.load(f)

        return sentence_to_vec
Example #18
def add_to_dictionary(word, part, mod=0):
    result = CustomDictionary.add(word, part)
    if not result and mod:
        CustomDictionary.insert(word, part)
    text = "我用天猫交社保"
    print(HanLP.segment(text))
    return result
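A hedged usage sketch. The snippet above relies on a module-level CustomDictionary; one way to obtain it (an assumption about the original imports) is via pyhanlp's JClass. The nature strings follow HanLP's 'tag frequency' custom-dictionary format, and the words are illustrative only.

from pyhanlp import HanLP, JClass

CustomDictionary = JClass('com.hankcs.hanlp.dictionary.CustomDictionary')
add_to_dictionary('天猫', 'nz 1024')        # add only if the word is not already present
add_to_dictionary('交社保', 'v 1', mod=1)   # force-insert even if a conflicting entry exists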
Example #19
def raw_seg():
    """
    newSegment() supports the modes below; viterbi is the default.
    viterbi:    best balance of speed and accuracy; shortest-path segmentation, solved in HanLP with the Viterbi algorithm
    dat:        double-array trie dictionary segmentation, tens of millions of characters per second (POS tags may be unavailable, depending on your dictionary)
    crf:        high accuracy for segmentation, POS tagging and NER; suited to demanding NLP tasks
    perceptron: segmentation, POS tagging and NER, with support for online learning
    nshort:     N-shortest-path; slightly better NER at the cost of speed
    """
    seg = HanLP.newSegment()
    for st in sentences:
        print(seg.seg(st))

    seg_crf = HanLP.newSegment("crf")
    for st in sentences:
        print(seg_crf.seg(st))
    """
Example #20
def get_keyword(content, keynum=2):
    """
    Extract the keywords of a question; the number of keywords is controlled by keynum.
    :param content: a sentence
    :return: a list of keywords
    """
    keywordList = HanLP.extractKeyword(content, keynum)
    return keywordList
Example #21
 def segment(self, text):
     word_tag_list = HanLP.segment(text)
     word_list = []
     for word_tag in word_tag_list:
         # str(term) looks like "word/nature"; split on the last '/' so words containing '/' still unpack cleanly
         word, tag = str(word_tag).rsplit('/', 1)
         if tag == 'n':
             word_list.append(word)
     return word_list
Example #22
def convertToSimplifiedChinese(traditionalChineseString):
    """
     * Traditional-to-Simplified Chinese conversion
     *
     * @param traditionalChineseString Traditional Chinese text
     * @return Simplified Chinese text
    """
    return HanLP.convertToSimplifiedChinese(traditionalChineseString)
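A minimal usage sketch for the wrapper above; the sample string is a placeholder.

from pyhanlp import HanLP

print(convertToSimplifiedChinese('憂鬱的臺灣烏龜'))  # expected: 忧郁的台湾乌龟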
Example #23
 def test_custom_dict_forcing(self):
     segment = HanLP.newSegment('viterbi')
     CustomDictionary.insert('川普', 'nr 1')
     self.assertIn('四川/ns, 普通人/n, 与/cc, 川/b, 普通/a, 电话/n',
                   segment.seg('四川普通人与川普通电话').__str__())
     segment.enableCustomDictionaryForcing(True)
     self.assertIn('四川/ns, 普通人/n, 与/cc, 川普/nr, 通电话/vi',
                   segment.seg('四川普通人与川普通电话').__str__())
Example #24
 def load_data(self, file):
     result = []
     with open(file, mode='r', encoding="utf-8") as fp:
         lines = fp.readlines()
         for line in lines:
             words = HanLP.segment(str(line).strip())
             result.append(" ".join([str(i.word) for i in words]))
     return result
Example #25
def hanlp_recognize(text):

    # segment = HanLP.newSegment().enableNameRecognize(True)
    # segment = HanLP.newSegment().enableTranslatedNameRecognize(True)
    # segment = HanLP.newSegment().enablePlaceRecognize(True)
    segment = HanLP.newSegment().enableOrganizationRecognize(True)
    term_list = segment.seg(text)
    print(term_list)
Example #26
def segment(text):
    '''
    Segment a Chinese sentence with HanLP; fall back to whitespace splitting on failure.
    (`hanlp` below is presumably the name the original module imported HanLP under.)
    '''
    try:
        seg_result = hanlp.segment(text)
        return [term.word for term in seg_result]
    except Exception:
        return text.split()
Example #27
def parseDependency(sentence):
    """
     * Dependency grammar parsing
     *
     * @param sentence the sentence to analyze
     * @return the dependency tree in CoNLL format

    """
    return HanLP.parseDependency(sentence)
Example #28
def convertToPinyinList(text):
    """
     * Convert text to pinyin
     *
     * @param text the text to analyze
     * @return a list of pinyin objects

    """
    return HanLP.convertToPinyinList(text)
Example #29
def convertToTraditionalChinese(simplifiedChineseString):
    """
     * Simplified-to-Traditional Chinese conversion
     *
     * @param simplifiedChineseString Simplified Chinese text
     * @return Traditional Chinese text

    """
    return HanLP.convertToTraditionalChinese(simplifiedChineseString)
Example #30
def extractKeyword(document, size):
    """
     * Extract keywords
     *
     * @param document the document content
     * @param size     how many keywords to extract
     * @return a list of keywords
    """
    return HanLP.extractKeyword(document, size)
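A minimal usage sketch for the wrapper above; the sample text is a placeholder.

from pyhanlp import HanLP

text = '程序员是从事程序开发、程序维护的专业人员。程序员一般分为程序设计人员和程序编码人员。'
print(extractKeyword(text, 3))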