Example #1
import random

import synonyms
from flask import request, make_response


def jinyi():
    word = request.args.get('word')

    if request.method == 'POST':
        word = request.form['word']

    if word is None:
        return '请提供要取近义的文字,参数名为 word '  # "please pass the text via the 'word' parameter"

    words, tags = synonyms.seg(word)

    # POS tags whose tokens are left unchanged (punctuation, proper nouns, English, etc.)
    ignore_cixing = ['x', 'nz', 'nr', 'eng', 'nrfg']

    nword = ''
    for i, token in enumerate(words):
        if tags[i] in ignore_cixing:
            nstr = token
        else:
            nstr = _nearby_word(token)
            if nstr == '':
                nstr = token
            elif random.randint(1, 10) <= 1:
                # keep the original token with a 1-in-10 chance
                nstr = token
        nword += nstr

    # print('original: %s\nnew: %s' % (word, nword))
    return make_response(nword)
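The helper `_nearby_word` used above is not shown on this page. A minimal sketch, assuming it should return the closest synonym, or an empty string when none is found:

def _nearby_word(word):
    # synonyms.nearby returns (candidate_words, scores); index 0 is usually
    # the query word itself, so the first real synonym sits at index 1
    candidates, scores = synonyms.nearby(word)
    if len(candidates) > 1:
        return candidates[1]
    return ''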
Example #2
    def fenci(self, s):
        # seg returns a (words, tags) tuple
        res = synonyms.seg(s)
        if len(res) > 1:
            print("word segmentation succeeded")
            return res[0]
        else:
            print("word segmentation failed")
            return None
Example #3
def nermatch():
    # 'keywords' is assumed to be a pandas DataFrame with 'keywords' and 'entity' columns
    keywords['flag'] = ''
    for i in range(len(keywords)):
        keywords.at[i, 'flag'] = []
        for ii, kw in enumerate(keywords['keywords'][i].split(',')):
            words = synonyms.seg(kw)
            # words is (tokens, tags); record the tag of the first token
            keywords['flag'][i].append(words[1][0])
            if keywords['flag'][i][ii] == 'nt' or keywords['flag'][i][ii] == 'nr':
                # assumed: record the first segmented token as the entity
                keywords['entity'][i].append(words[0][0])
Example #4
import json

import synonyms
# kmcha_crawler_model is a project-local module (not shown here)


def find_synonyms(word):
    """Use several techniques to find synonyms for the input word.

    Args:
        word (str): the word to look up

    Returns:
        list: matched profession names
    """
    # import the professions file
    with open('./dataset/profession2.json', 'r', encoding='utf-8') as jsonfile:
        profession_json = json.load(jsonfile)

    profession_list = []
    prof_kwords_list = []
    for profession in profession_json['data']:
        profession_list.append(profession['name'])
        prof_kwords_list.append(profession['kwords'])

    # find the nearby words in the Synonyms module
    nearby_words = synonyms.nearby(word)
    if len(nearby_words[0]):
        s_words = nearby_words[0][0:5]
        print(s_words)

    # if no nearby word is found, fall back to word segmentation
    else:
        seg_words = []
        seg = synonyms.seg(word)
        for i, v in enumerate(seg[1]):
            if v == 'n':
                seg_words.append(seg[0][i])
        s_words = seg_words

    # use kmcha to search synonyms
    km_words = kmcha_crawler_model.kmcha_search((word, '01'))

    # merge the kmcha results into the synonyms found so far
    # (a separate loop variable keeps the input word from being shadowed)
    for km_word in km_words:
        if km_word not in s_words:
            s_words.append(km_word)

    # add synonyms to the result if they exist in the professions lists
    result = []
    for i, profession in enumerate(profession_list):
        prof_kword = prof_kwords_list[i]
        for seg in s_words:
            # if seg == profession or seg in profession:  # full match
            if seg == profession or seg == prof_kword:  # partial match on kwords only
                result.append(profession)
    return result
Example #5
def generate_vocab(text_data):
    """
        Extract all the noun from dataset
    :param text: list<str>
    :return:
    """

    vocab = set()
    for sentence in text_data:
        segs = synonyms.seg(sentence)
        words = [x for x, y in zip(segs[0], segs[1]) if y == "n"]
        for word in words:
            if len(word) >= 2:
                vocab.add(word)

    if os.path.exists(config.vocab_path):
        os.remove(config.vocab_path)

    logger.info("Writing the vocab...")
    with codecs.open(config.vocab_path, "wb") as f:
        pickle.dump(vocab, f)
    del vocab
    logger.warning("vocab.pkl: %s" % config.vocab_path)
    print("Done!")
Example #6
    def text(self, text):

        # text = "可以用于自然语言理解的很多任务"
        kws, s = synonyms.seg(text)

        new = ''
        for key, item in enumerate(s):
            # kn holds the neighbour candidates, p their similarity scores
            kn, p = synonyms.nearby(kws[key])
            print(kn)
            print(p)
            if len(kn) > 1 and p[1] > 0.8:
                # the first neighbour is the word itself, so take the second
                print(kn[1])
                print('suggested replacement')
                new = new + kn[1]
            else:
                new = new + kws[key]
        return new
Example #7
import synonyms

print(synonyms.seg("中国南海"))
print(synonyms.seg("中南海"))

print("中国:{}".format(synonyms.nearby("人脸")))

print(synonyms.compare("西北","塑料",seg=False))
Example #8
import numpy as np
import synonyms


def cPrintSeg(text):
    result = synonyms.seg(text)
    print('@+@')
    print('segmentation:', result)
    print('@-@')

# import sklearn
# from sklearn.preprocessing import PolynomialFeatures
# from sklearn.linear_model import LinearRegression
# import matplotlib.pyplot as plt
# import matplotlib.ticker as ticker
# import matplotlib as mpl
# from scipy import interpolate
# from statsmodels.tsa.ar_model import AR

data_long = np.loadtxt("data_test1.csv", str, delimiter=",", skiprows=1)
print(data_long)

cixing = []  # part-of-speech tags for every row
for i in range(0, len(data_long)):
    cixing.append(synonyms.seg(data_long[i]))


test = synonyms.nearby("人脸")
print(test[0])
print("识别: %s" % (synonyms.nearby("识别"),))
print("NOT_EXIST: %s" % (synonyms.nearby("NOT_EXIST"),))
synonyms.display("金融")
print(synonyms.v("金融"))  # v() needs a word argument; "金融" is used here as an example
print(1)


cixiangliang = []  # word vectors for every row
for i in range(0, len(data_long)):
    # the original example is truncated here; collecting word vectors is one plausible body
    cixiangliang.append(synonyms.v(data_long[i]))
Example #10
    def fenci(self, s):
        res = synonyms.seg(s)
        if len(res) > 1:
            return res[0]
        else:
            return None
Example #11
    def test_wordseg(self):
        print("test_wordseg")
        print(synonyms.seg("中文近义词工具包"))
Example #12
import numpy as np
import synonyms


def aug_df(reviews_df, labels_df, op, n=3):
	for idx in reviews_df.index:
		id = reviews_df.loc[idx, 'id']
		rv = reviews_df.loc[idx, 'Reviews']
		for i in reversed(range(len(rv))):
			if rv[i].strip() == '':
				for j in labels_df[labels_df['id'] == id].index:
					lb = labels_df[labels_df['id'] == id].loc[j]
					a_s = lb['A_start'].strip()
					a_e = lb['A_end'].strip()
					if a_s != '' and a_e != '':
						a_s = int(a_s)
						a_e = int(a_e)
						if a_s > i:
							a_s -= 1
							a_e -= 1
							labels_df.loc[j, 'A_start'] = str(a_s)
							labels_df.loc[j, 'A_end'] = str(a_e)
					o_s = lb['O_start'].strip()
					o_e = lb['O_end'].strip()
					if o_s != '' and o_e != '':
						o_s = int(o_s)
						o_e = int(o_e)
						if o_s > i:
							o_s -= 1
							o_e -= 1
							labels_df.loc[j, 'O_start'] = str(o_s)
							labels_df.loc[j, 'O_end'] = str(o_e)

		rv = rv.replace(' ', '')

		still_spans = []
		for i in labels_df[labels_df['id'] == id].index:
			lb = labels_df.loc[i]
			a_s = lb['A_start'].strip()
			a_e = lb['A_end'].strip()
			if a_s != '' and a_e != '':
				still_spans.append((int(a_s), int(a_e)))
			o_s = lb['O_start'].strip()
			o_e = lb['O_end'].strip()
			if o_s != '' and o_e != '':
				still_spans.append((int(o_s), int(o_e)))

		still_spans.sort(key=lambda x: x[0])

		rv_tokens = synonyms.seg(rv)[0]
		editable_tokens = []
		editable_spans = []
		cur = 0
		for i in range(len(rv_tokens)):

			end = cur + len(rv_tokens[i])
			editable = True
			for span in still_spans:
				if is_intersec(cur, end, span[0], span[1]):
					editable = False
					break
			if editable and (rv_tokens[i] not in [',', ',', '!', '。', '*', '?', '?']):
				editable_spans.append((cur, end))
				editable_tokens.append(rv_tokens[i])
			cur = end

		if not editable_tokens:
			continue

		rv_list = list(rv)
		if op == 'delete' or op == 'replace' or op == 'insert':
			to_edit = sorted(np.random.choice(range(len(editable_tokens)), size=min(len(editable_tokens), n), replace=False),
				reverse=True)
			for ii in to_edit:
				span = editable_spans[ii]
				token = editable_tokens[ii]
				if op == 'delete' or op == 'replace':
					left, right = span
					if op == 'delete':
						target_token = ''
					else:
						candi, probs = synonyms.nearby(token)
						if len(candi) <= 1:
							target_token = ''
						else:
							probs = np.array(probs[1:]) / sum(probs[1:])
							target_token = np.random.choice(candi[1:], p=probs)
				else:
					left, right = span[-1], span[-1]
					token = ''
					candi, probs = synonyms.nearby(editable_tokens[ii])
					if len(candi) <= 1:
						target_token = ''
					else:
						probs = np.array(probs[1:]) / sum(probs[1:])
						target_token = np.random.choice(candi[1:], p=probs)

				shift = len(target_token)-len(token)

				for i in labels_df[labels_df['id'] == id].index:
					lb = labels_df.loc[i]
					a_s = lb['A_start'].strip()
					a_e = lb['A_end'].strip()
					if a_s != '' and a_e != '':
						a_s = int(a_s)
						a_e = int(a_e)
						if a_s >= span[-1]:
							a_s += shift
							a_e += shift
							labels_df.loc[i, 'A_start'] = str(a_s)
							labels_df.loc[i, 'A_end'] = str(a_e)
					o_s = lb['O_start'].strip()
					o_e = lb['O_end'].strip()
					if o_s != '' and o_e != '':
						o_s = int(o_s)
						o_e = int(o_e)
						if o_s >= span[-1]:
							o_s += shift
							o_e += shift
							labels_df.loc[i, 'O_start'] = str(o_s)
							labels_df.loc[i, 'O_end'] = str(o_e)
				print(token)
				print(''.join(rv_list[:left]), ''.join(rv_list[right:]))
				rv_list = rv_list[:left] + list(target_token) + rv_list[right:]

		elif op == 'swap':
			cur_time = 0
			if len(editable_tokens) < 2:
				continue
			if len(editable_tokens) == 2:
				time = 1
			else:
				time = n
			while cur_time != time:
				idx0, idx1 = sorted(np.random.choice(range(len(editable_tokens)), size=2, replace=False))
				token0, token1 = editable_tokens[idx0], editable_tokens[idx1]
				span0, span1 = editable_spans[idx0], editable_spans[idx1]
				print(token0, token1)
				editable_tokens[idx0], editable_tokens[idx1] = token1, token0
				if len(token0) != len(token1):
					shift = len(token1) - len(token0)
					editable_spans[idx0] = (span0[0], span0[0]+len(token1))
					editable_spans[idx1] = (span1[0]+shift, span1[0] + shift + len(token0))

					for idx_edt in range(len(editable_tokens)):
						cur_span = editable_spans[idx_edt]
						if cur_span[0] >= span0[1] and cur_span[1] <= span1[0]:
							editable_spans[idx_edt] = (cur_span[0]+shift, cur_span[1]+shift)

					for i in labels_df[labels_df['id'] == id].index:
						lb = labels_df.loc[i]
						a_s = lb['A_start'].strip()
						a_e = lb['A_end'].strip()
						if a_s != '' and a_e != '':
							a_s = int(a_s)
							a_e = int(a_e)
							if a_s >= span0[1] and a_e <= span1[0]:
								a_s += shift
								a_e += shift
								labels_df.loc[i, 'A_start'] = str(a_s)
								labels_df.loc[i, 'A_end'] = str(a_e)
						o_s = lb['O_start'].strip()
						o_e = lb['O_end'].strip()
						if o_s != '' and o_e != '':
							o_s = int(o_s)
							o_e = int(o_e)
							if o_s >= span0[1] and o_e <= span1[0]:
								o_s += shift
								o_e += shift
								labels_df.loc[i, 'O_start'] = str(o_s)
								labels_df.loc[i, 'O_end'] = str(o_e)

				rv_list = rv_list[:span0[0]] + list(token1) + rv_list[span0[1]: span1[0]] + list(token0) + rv_list[span1[1]:]

				cur_time += 1

		rv_new = ''.join(rv_list)
		reviews_df.loc[idx, 'Reviews'] = rv_new
		print(rv)
		print(rv_new)
		print(labels_df[labels_df['id'] == id])

	return reviews_df, labels_df
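`aug_df` relies on an `is_intersec` helper that is not shown on this page. A minimal sketch, assuming spans are half-open character intervals [start, end):

def is_intersec(s1, e1, s2, e2):
    # two half-open intervals overlap iff each one starts before the other ends
    return s1 < e2 and s2 < e1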
Example #13
def segment(text):
    """
    Segment the text into (words, tags).
    """
    return synonyms.seg(text)
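A quick usage sketch: `segment` simply forwards to `synonyms.seg`, which returns a `(words, tags)` tuple:

words, tags = segment("中文近义词工具包")
print(words)  # ['中文', '近义词', '工具包']
print(tags)   # e.g. ['nz', 'n', 'n']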
Example #14
# #
# # seg_snownlp = SnowNLP(content)
# # print("/".join(seg_snownlp.words))
#
# import pkuseg
# print("*****pkuseg******")
# pku = pkuseg.pkuseg()
#
# seg_pku = pku.cut(content)
# print("/".join(seg_pku))
#
# print("*****thulac******")
# import thulac
# thu_lac = thulac.thulac(seg_only=True)
# thu_seg = thu_lac.cut(content, text=True)
# print("/".join(thu_seg))
#
# print("*****hanlp******")
# from pyhanlp import HanLP
# seg_hanlp = HanLP.segment(content)
# print("/".join([term.word for term in seg_hanlp]))

print("*****synonyms******")
import synonyms
content = '解放军信息工程大学网络空间安全学院'
syn_seg = synonyms.seg(content)
print(syn_seg)

sen1 = '漏洞挖掘'
sen2 = '漏洞检测'
print(synonyms.compare(sen1, sen2, seg=True))
Example #15
import numpy as np
import jieba
import synonyms
import random
similar_question = '执行案件在财务系统获取不案件信息'
cut_list = list(jieba.cut(similar_question))
print(cut_list)
# while (True):
#     idx = random.randint(0, len(cut_list) - 1)
#     if len(synonyms.nearby(cut_list[idx])[0]) >= 2:
#         print(cut_list[idx])
#         change_word=synonyms.nearby(cut_list[idx])[0][1]
#         break
# print(change_word)
print(synonyms.seg('执行案件在财务系统获取不案件信息'))
print(synonyms.nearby('执行'))
list1 = ['执行', '继续执行', '督导', '指派', '可执行', '执行者', '监督', '制订', '分派', '拒绝执行']
list1.pop(2)
print(list1)
Example #16
import csv
import random

import synonyms

writer = csv.writer(expand)  # 'expand' is an output file opened elsewhere (not shown)
with open('final_shuffle.csv', 'r', encoding='gbk') as all_file:
    reader = csv.reader(all_file)
    part_rand = []
    stand = ''
    for line in reader:
        # Collect the paraphrases that belong to one standard question.
        # The last standard question would skip the expansion step below, so the
        # file needs one extra trailing row with a different standard question.
        if line[1] == stand:
            part_rand.append(line[0])
        else:
            for rand in part_rand:
                # write the original sentence
                writer.writerow([rand, stand])
                # segment the sentence
                cut_word = synonyms.seg(rand)
                syns = []
                # fetch the ten nearest synonyms of every token
                for word in cut_word[0]:
                    syn = synonyms.nearby(word)
                    syns.append(syn[0])
                # the existing data volume decides how often each sentence is expanded
                # for i in range(250 // len(part_rand)):
                for i in range(1):
                    new = ''
                    # per token: 0.6 keep, 0.2 replace with a synonym, 0.2 unk
                    for index, word in enumerate(cut_word[0]):
                        syn = syns[index]
                        k = random.randint(0, 9)
                        if k in range(6):
                            new += word
                        # the snippet is truncated here in the source; the branches
                        # below follow the probabilities stated above (assumed)
                        elif k in range(6, 8) and len(syn) > 1:
                            new += random.choice(syn[1:])
                        else:
                            new += '<unk>'
Example #17
import random
import re

import numpy as np
import pandas as pd
import synonyms
from numpy.random import uniform  # assumed source of the bare uniform() calls below


def data_enforce_(label_file, review_file):
    """Data augmentation: replace parts of each sample with probability 0.3."""
    columns_1 = "id,AspectTerms,A_start,A_end,OpinionTerms,O_start,O_end,Categories,Polarities".split(
        ",")
    columns_2 = "id,Reviews".split(",")
    df_labels = pd.read_csv(open(label_file, encoding="utf-8"),
                            header=0)[columns_1]
    df_reviews = pd.read_csv(open(review_file, encoding="utf-8"),
                             header=0)[columns_2]
    df_reviews.Reviews = [re.sub(r"\s+", ",", v) for v in df_reviews.Reviews]
    print(df_labels[:3])
    print(df_reviews[:3])
    res_1 = []
    res_2 = []
    count = 0
    for _ in range(20):
        # for _ in range(3):  # test
        print(_)
        for row_re in df_reviews.values:
            count += 1
            # logger.info(count)
            is_fake = False
            change_type = ""

            # randomly pick one label row to mutate
            rows_la = df_labels[df_labels.id == row_re[0]].values.copy()
            one_index = random.randint(0, len(rows_la) - 1)
            # print(rows_la)
            row_la = rows_la[one_index]
            # print(row_la)
            row_label = list(row_la)
            row_review = list(row_re)
            if row_label[1] != "_":
                # AspectTerms 随机替换
                aspect = row_label[1]
                # 对于置信度大于0.4 的均作为备选
                aspect_syn = [
                    word for word, _ in zip(*synonyms.nearby(aspect))
                    if _ > 0.4
                ]
                if uniform() < 0.5 and aspect_syn:
                    # 随机选出一个替换
                    aspect_replace = random.choice(aspect_syn)
                    row_label[1] = aspect_replace
                    row_review[1] = row_review[1].replace(
                        aspect, aspect_replace)
                    is_fake = True
                    change_type += "+" + "替换aspect"

            if row_label[4] != "_":
                # 情感 随机替换
                opinion = row_label[4]
                # print(synonyms.nearby(opinion))
                opinion_syn = [
                    word for word, _ in zip(*synonyms.nearby(opinion))
                    if _ > 0.4
                ]
                if uniform() < 0.5 and opinion_syn:
                    opinion_replace = random.choice(opinion_syn)
                    row_label[4] = opinion_replace
                    row_review[1] = row_review[1].replace(
                        opinion, opinion_replace)
                    is_fake = True
                    change_type += "+" + "替换opinion"

            # after the two steps above, the chance a row was replaced is below 0.49

            if uniform() < 0.1:
                # with low probability, swap the positions of aspect and opinion
                if row_label[1] != "_" and row_label[4] != "_":
                    # swap the label offsets
                    tmp = row_label[2]
                    row_label[2] = row_label[5]
                    row_label[5] = tmp
                    tmp = row_label[3]
                    row_label[3] = row_label[6]
                    row_label[6] = tmp
                    # swap the two terms in the text; a placeholder keeps the
                    # second replace from undoing the first
                    row_review[1] = row_review[1] \
                        .replace(row_label[1], "\x00") \
                        .replace(row_label[4], row_label[1]) \
                        .replace("\x00", row_label[4])
                    is_fake = True
                    change_type += "+" + "swap items"

            # after the operations above, the chance of transformation stays below 0.54

            if uniform() < 0.3:
                # randomly replace one to three tokens
                seg_words = synonyms.seg(row_review[1].replace(
                    row_label[1], "@").replace(row_label[4], "@"))[0]
                num = random_pick([1, 2, 3], [0.7, 0.25, 0.05])
                # print(seg_words, num)
                god_words = np.random.choice(seg_words,
                                             min(num, len(seg_words)),
                                             replace=False)
                for god_word in god_words:
                    if god_word != "@" and god_word != "&":
                        tmp = synonyms.nearby(god_word)[0]
                        # row_review only holds [id, Reviews]; the opinion term
                        # lives in row_label[4]
                        if tmp and god_word not in row_review[
                                1] and god_word not in row_label[4]:
                            # synonym replacement
                            row_review[1] = row_review[1].replace(
                                god_word, random.choice(tmp))
                            is_fake = True
                change_type += "+" + "replace others"

            # after the processing above, the chance this sample is synthetic is below 0.74

            if uniform() < 0.1:
                # randomly swap two tokens
                seg_words = synonyms.seg(row_review[1].replace(
                    row_label[1], "@").replace(row_label[4], "@"))[0]
                if len(seg_words) > 5:
                    tmp = np.random.choice(seg_words, 2, replace=False)
                    # a placeholder keeps the second replace from undoing the first
                    row_review[1] = row_review[1].replace(tmp[0], "\x00") \
                                                 .replace(tmp[1], tmp[0]) \
                                                 .replace("\x00", tmp[1])
                    is_fake = True
                    change_type += "+" + "swap other tokens"

            if uniform() < 0.3:
                # randomly delete one character
                char_index = random.randint(0, len(row_review[1]) - 1)
                if row_review[1][char_index] not in {
                        v
                        for v in (row_label[1] + row_label[4])
                }:
                    row_review[1] = row_review[1][:char_index] + row_review[1][
                        char_index + 1:]
                    is_fake = True
                    change_type += "+" + "delete"

            # after all the steps, augmented samples make up less than 0.9 of the total

            # renumber the ids sequentially
            rows_la[one_index] = np.array(row_label)
            for v in range(len(rows_la)):
                rows_la[v][0] = count
            row_review[0] = count
            row_review[1] = re.sub(r"\s+", ",", row_review[1])
            # logger.info(rows_la)

            res_1.extend(rows_la)
            res_2.append(row_review + [is_fake, change_type, row_re[-1]])

    pd.DataFrame(data=res_1, columns=columns_1).to_csv(
        "zhejiang/enforce_data/train_labels_enforce.csv",
        index=False,
        encoding="utf-8")
    pd.DataFrame(data=res_2,
                 columns=columns_2 +
                 ["is_fake", "change_type", "original_review"]).to_csv(
                     "zhejiang/enforce_data/train_reviews_enforce.csv",
                     index=False,
                     encoding="utf-8")
Example #18
# coding:utf-8
import synonyms
print(synonyms.seg("能量"))
'''
Chinese synonyms toolkit. It supports many natural-language-understanding tasks:
text alignment, recommendation, similarity computation, semantic shift, keyword
extraction, concept extraction, automatic summarization, search engines, and more.

Output:
(['能量'], ['n'])
'''
Example #19
#!/usr/bin/python
# -*- coding: utf8 -*-

import synonyms

print("人脸")
result = synonyms.nearby("人脸")
words = result[0]
scores = result[1]
print(len(words))
print(len(scores))
for i in range(len(words)):
    print(words[i], "=", scores[i])

result1 = synonyms.seg("我是中国人")
words1 = result1[0]
tags = result1[1]

for i in range(len(words1)):
    print(words1[i], "->", tags[i])

r = synonyms.compare('商贸城', '贸易', seg=True)
print(r)
Example #20
# @Software: PyCharm
# @Time    : 2020-12-14 10:52
# @Author  : Super-Zhang
# @Description :
import synonyms
print(synonyms.seg("中文近义词工具包"))

# seg returns a tuple of two lists: the words and their part-of-speech tags.
# (['中文', '近义词', '工具包'], ['nz', 'n', 'n'])


print("交换: ", synonyms.nearby("交换"))
print("两数: ", synonyms.nearby("两个数"))

# sen1 = "两数交换"
# sen2 = "a与b交换"
# r = synonyms.compare(sen1, sen2, seg=True)
# print(r)
print("======= semantic similarity ==========")

# nouns and verbs compare reasonably well

# print(synonyms.compare("打开主函数", "开启main函数", seg=True))


print("两数交换", "交换两个变量", synonyms.compare("两数交换", "交换两个变量", seg=True))
print("我爱看中国有嘻哈", "中国有嘻哈是我爱看的节目", synonyms.compare("我爱看中国有嘻哈", "中国有嘻哈是我爱看的节目", seg=True))
print("打开主函数", "开启main函数", synonyms.compare("打开主函数", "开启main函数", seg=True))
Example #21
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 27 14:17:27 2018

@author: madawei1
"""

import synonyms

# Chinese word segmentation
print(synonyms.seg("中文近义词工具包"))

# nearby
print("破洞: ", synonyms.nearby("破洞"))
print("女人: ", synonyms.nearby("女人"))
print("NOT_EXIST: ", synonyms.nearby("NOT_EXIST"))

sen1 = "快递"
sen2 = "物流"
r = synonyms.compare(sen1, sen2, seg=True)
print(r)