def __init__(self):
    # Can several dictionary files be loaded at once? fool's load_userdict
    # just forwards to _DICTIONARY.add_dict(path), so repeated calls simply
    # accumulate entries from each file.
    for dict_path in ("mydic/hushen_company.txt",
                      "mydic/company.dic",
                      "mydic/name.dic",
                      "mydic/term.dic"):
        fool.load_userdict(dict_path)
def classify(word, dict):  # NOTE: param name shadows builtin `dict`; kept for caller compatibility
    """Classify `word` into one of the global `categories` and persist it.

    Builds a TF-IDF corpus from the keywords already stored per category,
    trains an SVM on it, predicts the category of `word`'s explanation,
    inserts the keyword, then crawls and stores related links.

    :param word: the keyword to classify
    :param dict: path to a fool user-dictionary file used for segmentation
    """
    # Load the user dictionary ONCE, not once per matching DB row as before.
    fool.load_userdict(dict)

    corpus = []
    results = mysql.select("select * from T_Keywords")
    for category in categories:
        # Concatenate the segmented text of every keyword in this category.
        # "".join avoids the quadratic string += of the original.
        parts = ["".join(" ".join(fool.cut(result[3])[0]))
                 for result in results if result[2] == category]
        corpus.append("".join(parts))

    exp = get_parses(word)                      # explanation text for the word
    corpus.append(" ".join(fool.cut(exp)[0]))   # segmented explanation

    vectorizer = CountVectorizer()
    csr_mat = vectorizer.fit_transform(corpus)
    tfidf = TfidfTransformer().fit_transform(csr_mat)

    y = np.array(categories)
    model = SVC()
    length = len(categories)                    # was categories.__len__()
    model.fit(tfidf[0:length], y)
    predicted = model.predict(tfidf[length:])

    # SECURITY: string-formatted SQL is injection-prone; if mysql.exec
    # supports placeholder parameters, switch to them.
    sql = ("insert into T_Keywords(keyword,category,weight,explanation) "
           "values('%s','%s','%s','%s')" % (word, predicted[0], 1, exp))
    kid = mysql.exec(sql)

    # Crawl related links and attach them to the newly inserted keyword.
    for href in get_policy(word):
        sql = "insert into T_Links(title,href,kid) values('%s','%s','%s')" % (
            href.get('title'), href.get('url'), kid)
        mysql.exec(sql)
import fool from fool.dictionary import Dictionary def td(): d = Dictionary() d.add_dict("./test_dict.txt") matchs = d.parse_words("什么鬼我难受香菇") for mat in matchs: print(mat.keyword) print(mat.start) print(mat.end) print(d.sizes) fool.load_userdict("./test_dict.txt") print(fool._DICTIONARY.sizes) print(fool._DICTIONARY.weights) def tcut(): text = "我在北京天安门" words, ners = fool.analysis(text) print(ners) words = fool.pos_cut(text) print(words) fool.delete_userdict() print(fool.cut(text)) if __name__ == '__main__':
parser.add_argument("-u", "--user_dict", help="use USER_DICT together with the default dictionary or DICT (if specified)") parser.add_argument("-b", "--batch_size", default=1, type = int ,help="batch size ") parser.add_argument("filename", nargs='?', help="input file") args = parser.parse_args() delim = args.delimiter plim = args.pos batch_zize = args.batch_size if args.user_dict: fool.load_userdict(args.user_dict) fp = open(args.filename, 'r') if args.filename else sys.stdin lines = fp.readlines(batch_zize) while lines: lines = [ln.strip("\r\n") for ln in lines] if args.pos: result_list = fool.pos_cut(lines) for res in result_list: out_str = [plim.join(p) for p in res] print(delim.join(out_str)) else: result_list = fool.cut(lines) for res in result_list:
#!/usr/bin/env python
# -*-coding:utf-8-*-
import fool

# Sample sentences used to exercise every public fool API in turn.
text = ["我在北京天安门看你难受香菇,一一千四百二十九",
        "我在北京晒太阳你在非洲看雪",
        "千年不变的是什么",
        "我在北京天安门。"]

print("no dict:", fool.cut(text, ignore=True))

fool.load_userdict("./test_dict.txt")
print("use dict: ", fool.cut(text))

fool.delete_userdict()
print("delete dict:", fool.cut(text))

pos_words = fool.pos_cut(text)
print("pos result", pos_words)

words, ners = fool.analysis(text)
print("ners: ", ners)

ners = fool.ner(text)
print("ners:", ners)
import time

TimeStart = time.time()
TempTime = TimeStart

raw_cn = "./com_cn.txt"

# There are 20 different years, from 1998 to 2017: one title/main2/year
# bucket per year.
All_Dict = dict()
for year in range(1998, 2018):
    All_Dict['title_' + str(year)] = list()
    # All_Dict['main1_' + str(year)] = list()
    All_Dict['main2_' + str(year)] = list()
    All_Dict['year_' + str(year)] = list()

user_dict = "./_reference/thulac/THUOCL_it_space.txt"
fool.load_userdict(user_dict)

count = 0
unstructured = list()
with open(raw_cn, 'r', encoding='UTF-8') as raw:
    for line in raw:
        temp = line.split('\t')
        if len(temp) == 3:
            current_year = str(temp[2].strip())
            All_Dict['title_' + current_year].append(temp[0])
            All_Dict['year_' + current_year].append(current_year)
            # All_Dict['main1_' + current_year].extend(fool.cut(temp[1]))
            # NOTE(review): fool.cut returns a list of token lists even for a
            # single string (sibling code indexes it with [0]) — confirm that
            # extending with the nested result is intentional here.
            All_Dict['main2_' + current_year].extend(fool.cut(temp[1]))
        else:
            # Malformed line: remember its index for later inspection.
            unstructured.append(count)
        count += 1
def load_dict(path):
    """Register the user dictionary at `path` with fool's global segmenter."""
    fool.load_userdict(path)
"--batch_size", default=1, type=int, help="batch size ") parser.add_argument("filename", nargs='?', help="input file") args = parser.parse_args() delim = args.delimiter plim = args.pos batch_zize = args.batch_size if args.user_dict: fool.load_userdict(args.user_dict) fp = open(args.filename, 'r') if args.filename else sys.stdin lines = fp.readlines(batch_zize) while lines: lines = [ln.strip("\r\n") for ln in lines] if args.pos: result_list = fool.pos_cut(lines) for res in result_list: out_str = [plim.join(p) for p in res] print(delim.join(out_str)) else: result_list = fool.cut(lines) for res in result_list: print(delim.join(res))
'''
Original tutorial:
https://github.com/rockyzhengwu/FoolNLTK/blob/master/README_CH.md
'''
import fool

# Text file holding the user-defined dictionary; each line has the format:
# word weight
path = r"C:\Users\lenvov\Desktop\my_diy_dic.txt"
fool.load_userdict(path)  # load the custom dictionary
# The dictionary only defines word weights, not parts of speech, so it does
# not help POS tagging.
# fool.delete_userdict();  # remove the user-defined dictionary

text = "习近平觉得张构架的趣多多比希斯罗机场的巧克力味的奥利奥要贵得多。"

# `words` holds the POS-tagged segmentation (built-in dictionary only, no
# custom dictionary applied); `ners` holds the recognized entities.
# Segmentation may be imprecise while NER is still correct; loading a custom
# dictionary afterwards can correct the segmentation. The `words` list from
# analysis() is unaffected by the user dictionary and is rarely used.
words, ners = fool.analysis(text)
print('文本切分:', fool.cut(text), '\n')
print('文本切分后进行词性标注:', fool.pos_cut(text), '\n')
print('words:', words, '\n')
print('实体识别', ners, '\n')
import fool

# BUG FIX: `path` was used without ever being defined, so this script crashed
# with a NameError before segmenting anything. Point it at the dictionary
# file to load (the other demo scripts in this project use ./test_dict.txt —
# adjust if a different dictionary is intended).
path = "./test_dict.txt"
fool.load_userdict(path)

text = "我在北京天安门看你难受香菇"
print(fool.cut(text))
import fool


def processSentence(sentence):
    """Segment, POS-tag and run NER over `sentence`, printing each result.

    Failures in the fool backend are reported instead of being silently
    swallowed (the original used a bare `except: pass`, which hid every
    error — including typos in this function itself).
    """
    try:
        print(fool.cut(sentence))
        print(fool.pos_cut(sentence))
        words, ners = fool.analysis(sentence)
        print(words, ners)
    except Exception as e:
        print("processSentence failed:", e)


if __name__ == "__main__":
    # Loading the user dictionary is best-effort: tolerate but report failure.
    try:
        fool.load_userdict('data/userDictForFool.txt')
    except Exception as e:
        print("could not load user dict:", e)
    # processSentence("一个傻子在北京")
    processSentence("日媒:中国第3艘航母已开建 不仅是模仿辽宁舰")
    processSentence("新华社:冷战时代美苏“星球大战” 前苏联被拖入军备竞赛")
import json
from collections import Counter

import jieba
import fool

path1 = '/home/xubinchen/rc_tf-master/data/results/test_predicted_bidaf.json'
path2 = '/home/xubinchen/rc_tf-master/data/results/test_predicted_mlstm.json'
path = '/home/xubinchen/data_test_raw.txt'

# Register the shared user dictionary with both segmenters, plus two words
# the dictionary is missing. (The original repeated the add_word pair after
# fool.load_userdict — a copy/paste duplicate with no effect, removed here.)
jieba.load_userdict('/home/xubinchen/data_test/dict.txt')
jieba.add_word('不含')
jieba.add_word('不到位')
fool.load_userdict('/home/xubinchen/data_test/dict.txt')


def ori_data(path):
    """Load MRC samples from a JSON-lines file.

    Each line must be a JSON object with `question`, `question_id`,
    `question_type` and `documents[0]['paragraphs']`.

    :param path: path to the JSON-lines file
    :return: list of [question_id, question, question_type, paragraphs]
    """
    ori = []
    with open(path) as fin:
        for lidx, line in enumerate(fin):
            sample = json.loads(line.strip())
            data = [sample['question_id'],
                    sample['question'],
                    sample['question_type'],
                    sample['documents'][0]['paragraphs']]
            ori.append(data)
    return ori