# Convert a pinyin sequence into candidate hanzi strings via the Pinyin2Hanzi HMM.
from Pinyin2Hanzi import DefaultHmmParams, viterbi


def gen_pinyin2hanzi(pinyin, num=5):
    """Return a dict mapping each candidate hanzi string to its (log) score."""
    results = {}
    hmmparams = DefaultHmmParams()
    result = viterbi(hmm_params=hmmparams,
                     observations=pinyin,
                     path_num=num,
                     log=True)  # log=True returns log-probability scores
    for item in result:
        results[''.join(item.path)] = item.score
    return results
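A minimal usage sketch, assuming Pinyin2Hanzi is installed; the syllables are illustrative:

if __name__ == '__main__':
    # Keys are candidate hanzi strings, values their log-probability scores.
    candidates = gen_pinyin2hanzi(['zhong', 'guo'], num=3)
    for hanzi, score in sorted(candidates.items(), key=lambda kv: -kv[1]):
        print(hanzi, score)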
Example #2
def top_k_transform(importance_score, list_of_texts, proportion, new_word_dictionary, black_list_word):
    """Replace the top-k most important Chinese tokens with homophone substitutes."""
    hmmparams = DefaultHmmParams()  # HMM for pinyin -> hanzi conversion

    target_text = tokenize(list_of_texts).split(' ')
    # At least one token is modified even when proportion is 0.
    k = int(len(target_text) * proportion) + 1
    # Indices of the k largest scores; selecting over the index range keeps
    # duplicate scores distinct, unlike list.index().
    top_k_score_index = heapq.nlargest(k, range(len(importance_score)),
                                       key=importance_score.__getitem__)

    # Hand-picked pinyin confusions for a few frequent syllables.
    pinyin_confusion = {
        ('ni',): [['li'], ['ni']],
        ('ta',): [['ta'], ['te']],
        ('cao',): [['ca'], ['cao']],
        ('ma',): [['me'], ['ma']],
        ('si',): [['shi'], ['si']],
    }

    for index in top_k_score_index:
        # Working copy representing the modified list_of_texts.
        gedit_text = copy.deepcopy(list_of_texts)
        if not is_Chinese(target_text[index]):
            continue
        pinyin_of_target_text = lazy_pinyin(target_text[index])
        key = tuple(pinyin_of_target_text)
        if key in pinyin_confusion:
            pinyin_of_target_text = random.choice(pinyin_confusion[key])
        try:
            # Convert the pinyin back into alternative hanzi candidates.
            hanzi_of_target_text = viterbi(hmm_params=hmmparams,
                                           observations=pinyin_of_target_text,
                                           path_num=10)

            # Greedily choose the substitute most similar to the original word.
            m_destination_word = calculate_similarity(target_text, index,
                                                      list_of_texts[index],
                                                      hanzi_of_target_text,
                                                      gedit_text, black_list_word)

            target_text[index] = m_destination_word
            list_of_texts = ''.join(target_text)
            # Record the new word in the dictionary.
            temp = new_word_dictionary.get(m_destination_word, 0) + 1
            # Once a new word has appeared 20 times, move it to the blacklist.
            if temp < 20:
                new_word_dictionary[m_destination_word] = temp
            else:
                new_word_dictionary.pop(m_destination_word)
                black_list_word.append(m_destination_word)
        except Exception:
            pass
    return list_of_texts
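A quick worked check of the top-k index selection above, on illustrative scores containing a duplicate:

import heapq

scores = [0.2, 0.9, 0.9, 0.1]
top2 = heapq.nlargest(2, range(len(scores)), key=scores.__getitem__)
print(top2)  # [1, 2]: both occurrences of 0.9 are selected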
Example #3
 def __init__(self):
     # Initialization: HMM/DAG parameters plus pinyin syllable tables.
     self.hmmparams = DefaultHmmParams()
     self.dagparams = DefaultDagParams()
     self.result = ''
     # Initials (shengmu), including the digraphs ch/sh/zh.
     self.shengmu = ['b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 't', 'w', 'x', 'y',
                     'z', 'ch', 'sh', 'zh']
     # Zero-initial syllables (finals that stand alone).
     self.yy = ['a', 'ai', 'an', 'ang', 'ao', 'e', 'en', 'eng', 'er', 'o', 'ou', 'ong']
     # ym_<initial>: the finals (yunmu) that may follow each initial.
     self.ym_b = ["a", "ai", "an", "ang", "ao", "ei", "en", "eng", "i", "ian", "iao", "ie", "in", "ing", "o", "u"]
     self.ym_c = ["a", "ai", "an", "ang", "ao", "e", "en", "eng", "i", "ong", "ou", "u", "uan", "ui", "un", "uo"]
     self.ym_d = ["a", "ai", "an", "ang", "ao", "e", "ei", "en", "eng", "i", "ia", "ian", "iao", "ie", "ing", "iu",
                  "ong", "ou", "u", "uan", "ui", "un", "uo"]
     self.ym_f = ["a", "an", "ang", "ei", "en", "eng", "iao", "o", "ou", "u"]
     self.ym_g = ["a", "ai", "an", "ang", "ao", "e", "ei", "en", "eng", "ong", "ou", "u", "uai", "uan", "uang", "ui",
                  "un", "uo"]
     self.ym_h = ["a", "ai", "an", "ang", "ao", "e", "ei", "en", "eng", "ong", "ou", "u", "ua", "uai", "uan", "uang",
                  "ui", "un", "uo"]
     self.ym_j = ["i", "ia", "ian", "iang", "iao", "ie", "in", "ing", "iong", "iu", "u", "uan", "ue", "un"]
     self.ym_k = ["a", "ai", "an", "ang", "ao", "e", "en", "eng", "ong", "ou", "u", "ui", "un", "uo"]
     self.ym_l = ["a", "ai", "an", "ang", "ao", "e", "ei", "eng", "i", "ia", "ian", "iao", "ie", "in", "ing", "iu",
                  "o", "ong", "ou", "u", "uan", "un", "uo", "v", "ve"]
     self.ym_m = ["a", "ai", "an", "ang", "ao", "e", "ei", "en", "eng", "i", "ian", "iao", "ie", "in", "ing", "iu",
                  "o", "ou", "u"]
     self.ym_n = ["a", "ai", "an", "ang", "ao", "e", "ei", "en", "eng", "i", "ian", "iang", "iao", "ie", "in", "ing",
                  "iu", "ong", "ou", "u", "uan", "un", "uo", "v", "ve"]
     self.ym_p = ["a", "ai", "an", "ang", "ao", "e", "ei", "en", "eng", "i", "ian", "iao", "ie", "in", "ing", "o",
                  "ou", "u"]
     self.ym_q = ["i", "ia", "ian", "iang", "iao", "ie", "in", "ing", "iong", "iu", "u", "uan", "ue", "un"]
     self.ym_r = ["an", "ang", "ao", "e", "en", "eng", "i", "ong", "ou", "u", "ua", "uan", "ui", "un", "uo"]
     self.ym_s = ["a", "ai", "an", "ang", "ao", "e", "en", "eng", "i", "ong", "ou", "u", "uan", "ui", "un", "uo"]
     self.ym_t = ["a", "ai", "an", "ang", "ao", "e", "ei", "eng", "i", "ian", "iao", "ie", "ing", "ong", "ou", "u",
                  "uan", "ui", "un", "uo"]
     self.ym_w = ["a", "ai", "an", "ang", "ei", "en", "eng", "o", "u"]
     self.ym_x = ["i", "ia", "ian", "iang", "iao", "ie", "in", "ing", "iong", "iu", "u", "uan", "ue", "un"]
     self.ym_y = ["a", "an", "ang", "ao", "e", "i", "in", "ing", "o", "ong", "ou", "u", "uan", "ue", "un"]
     self.ym_z = ["a", "ai", "an", "ang", "ao", "e", "ei", "en", "eng", "i", "ong", "ou", "u", "uan", "ui", "un",
                  "uo"]
     self.ym_ch = ["a", "ai", "an", "ang", "ao", "e", "en", "eng", "i", "ong", "ou", "u", "ua", "uai", "uan", "uang",
                   "ui", "un", "uo"]
     self.ym_sh = ["a", "ai", "an", "ang", "ao", "e", "ei", "en", "eng", "i", "ou", "u", "ua", "uai", "uan", "uang",
                   "ui", "un", "uo"]
     self.ym_zh = ["a", "ai", "an", "ang", "ao", "e", "ei", "en", "eng", "i", "ong", "ou", "u", "ua", "uai", "uan",
                   "uang", "ui", "un", "uo"]
     self.ym = [self.yy, self.ym_b, self.ym_c, self.ym_d, self.ym_f, self.ym_g, self.ym_h, self.ym_j, self.ym_k,
                self.ym_l, self.ym_m, self.ym_n, self.ym_p, self.ym_q, self.ym_r, self.ym_s, self.ym_t, self.ym_w,
                self.ym_x, self.ym_y, self.ym_z, self.ym_ch, self.ym_sh, self.ym_zh
                ]
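The tables above only enumerate which finals (yunmu) may legally follow each initial (shengmu). A sketch of how such tables could be used to split a toneless syllable into initial and final; the split_syllable helper and the trimmed-down INITIALS/FINALS tables are illustrative, not part of the original class:

# Hypothetical helper: split a toneless pinyin syllable using tables like the above.
INITIALS = ['zh', 'ch', 'sh', 'b', 'l', 'n']                       # subset, for illustration
FINALS = {'zh': ['ong', 'u'], 'b': ['a', 'an'], 'l': ['i', 'iu']}  # subset, for illustration

def split_syllable(syllable):
    # Try two-letter initials first so 'zh' is not misread as 'z' + 'h'.
    for initial in sorted(INITIALS, key=len, reverse=True):
        rest = syllable[len(initial):]
        if syllable.startswith(initial) and rest in FINALS.get(initial, []):
            return initial, rest
    return None  # not a valid syllable under these (partial) tables

print(split_syllable('zhong'))  # ('zh', 'ong')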
Example #4
 def __init__(self, useWhat='pickle', backen=False):
     global backenFlag
     self.fileOK = False
     self.useWhat = useWhat
     self.DagsPath = None
     self.database = None
     self.pickle = None
     self.backen = backen
     if self.backen:
         try:
             from Pinyin2Hanzi import DefaultHmmParams
             from Pinyin2Hanzi import viterbi
             self.hmmparams = DefaultHmmParams()
             self.viterbi = viterbi
         except ImportError:
             raise ImportError('Pinyin2Hanzi package not found; '
                               'please install it, '
                               'or set backen=False')
     self.CheckFiles()
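The guarded import above makes Pinyin2Hanzi an optional dependency. The same pattern factored into a standalone helper; load_backend is a hypothetical name, not part of the original class:

def load_backend(enabled=True):
    """Return (hmm_params, viterbi) when the optional backend is available."""
    if not enabled:
        return None
    try:
        from Pinyin2Hanzi import DefaultHmmParams, viterbi
    except ImportError as exc:
        raise ImportError('Pinyin2Hanzi package not found; '
                          'install it or call load_backend(enabled=False)') from exc
    return DefaultHmmParams(), viterbi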
Example #5
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from pypinyin import lazy_pinyin
from Pinyin2Hanzi import DefaultHmmParams
from Pinyin2Hanzi import viterbi

txt = u'锄禾日当午'

# Round-trip check: hanzi -> pinyin -> hanzi, keeping only the best path.
py = lazy_pinyin(txt)

hmmparams = DefaultHmmParams()
result = viterbi(hmm_params=hmmparams, observations=py, path_num=1)
for item in result:
    txt_rtn = u''.join(item.path)

if txt == txt_rtn:
    print(u'OK')
else:
    print(u'Error: %s -> %s -> %s' % (txt, py, txt_rtn))
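Pinyin2Hanzi also ships a DAG-based converter next to the HMM one. The same round-trip with it, assuming the dag/DefaultDagParams API shown in the project's README:

from pypinyin import lazy_pinyin
from Pinyin2Hanzi import DefaultDagParams, dag

txt = u'锄禾日当午'
py = lazy_pinyin(txt)

dagparams = DefaultDagParams()
# dag() takes the params positionally, then the observed pinyin list.
for item in dag(dagparams, py, path_num=1):
    print(u''.join(item.path))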
Example #6
# -*- coding:UTF-8 -*-
# Create time: 2019-12-20
# Code by: hjfzzm
import socket
import threading
from Pinyin2Hanzi import DefaultHmmParams
from Pinyin2Hanzi import is_pinyin
from Pinyin2Hanzi import simplify_pinyin
from Pinyin2Hanzi import viterbi

# Global variables
re_string = ""
# host = '172.17.228.35'
host = '127.0.0.1'
port = 10086
hmm = DefaultHmmParams()


class PyTrieNode(object):
    def __init__(self, key="", seq=None):
        if seq is None:
            seq = []
        self.key = key
        self.end = len(seq) == 0
        self.children = {}
        if len(seq) > 0:
            self.children[seq[0]] = PyTrieNode(seq[0], seq[1:])

    def add(self, seq):
        if len(seq) == 0:
            self.end = True
        elif seq[0] in self.children:
            self.children[seq[0]].add(seq[1:])  # descend into the existing child
        else:
            self.children[seq[0]] = PyTrieNode(seq[0], seq[1:])  # build the rest of the branch, as in __init__
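A small usage sketch for the trie; the contains helper is hypothetical, not part of the original class:

def contains(node, seq):
    """Hypothetical lookup: walk the trie along seq and check for a word end."""
    for key in seq:
        if key not in node.children:
            return False
        node = node.children[key]
    return node.end

root = PyTrieNode()
root.add(['ni', 'hao'])
root.add(['ni', 'men'])
print(contains(root, ['ni', 'hao']))  # True
print(contains(root, ['ni']))         # False: prefix only, not a stored word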
Example #7
def transform(line, tf_idf_score, new_word_dictionary, black_list_word):
    """Transform one line of text.

    :param line: input text before the adversarial attack
    :type line: str
    :returns: str -- output text after the adversarial attack
    """
    # TODO: adjust the logic below
    from preprocessing_module import preprocess_text

    line = preprocess_text(line)  # assumes preprocess_text returns the cleaned text
    # Choose whether to modify this line; with these choices a >= 6
    # never holds, so the early return is effectively disabled.
    a = random.choice([1, 0, 2, 5, 4])
    if a >= 6:
        return line
    hmmparams = DefaultHmmParams()  # HMM for pinyin -> hanzi conversion

    # Rank tokens by importance: a per-word abusiveness score.
    imp_score = importance(line, tf_idf_score)


    # Modify a fixed proportion of the words; at proportion 0, at least one word changes.
    out_line = top_k_transform(imp_score, line, 0, new_word_dictionary, black_list_word)
    out_line = "".join(out_line)
    out_line = out_line.replace('\n', '')
    m_line = tokenize(out_line)

    _list_m_line = list(m_line)

    # Replace the character '你' with a homophone substitute.

    for i, m_word in enumerate(m_line):
        if m_word in important_words:
            hanzi_of_target_text = ''
            pinyin_of_target_text = lazy_pinyin(m_word)
            if pinyin_of_target_text == ['ni']:
                hanzi_of_target_text = dict_word['ni']
            else:
                continue
            m_destination_word = m_word
            # Convert the pinyin into alternative hanzi candidates.
            nums_circle = 0
            # Pick a hanzi that differs from the original and is not blacklisted.
            while nums_circle <= 50:
                nums_circle += 1
                m_destination_word = random.choice(hanzi_of_target_text)
                if m_destination_word != m_word and m_destination_word not in black_list_word:
                    break
                else:
                    continue
            _list_m_line[i] = m_destination_word

            m_line = ''.join(_list_m_line)

            temp = new_word_dictionary.get(m_destination_word, 0)
            temp += 1
            # Once this new word has appeared 30 times, add it to the blacklist.
            if temp < 30:
                new_word_dictionary[m_destination_word] = temp
            else:
                new_word_dictionary.pop(m_destination_word)
                black_list_word.append(m_destination_word)
    out_line = m_line.split()
    out_line = ''.join(out_line)
    _line = out_line
    str_dot = ''

    # Score the original line to get the baseline probability.
    _ori_pro = reference_model(model, _line)
    _nums = 0

    # Append up to 50 commas at the end; stop once |current - original| / original > 0.8.
    for i in range(50):
        _line += ','
        _nums += 1
        _pre_pro = reference_model(model, _line)
        if abs(_pre_pro - _ori_pro)/_ori_pro > 0.8:
            break
    out_line = _line + str_dot
    print('out_line:', out_line)
    return out_line
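The comma-padding loop at the end can be exercised in isolation; a minimal sketch with a stubbed scorer standing in for reference_model (the stub and its numbers are illustrative only):

def stub_scorer(text):
    """Illustrative stand-in for reference_model: score drops as commas pile up."""
    return max(0.05, 1.0 - 0.03 * text.count(','))

line = '一个测试句子'
ori = stub_scorer(line)
for _ in range(50):
    line += ','
    if abs(stub_scorer(line) - ori) / ori > 0.8:
        break
print(line.count(','), 'commas appended')  # 27 with this stub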