def draw_1(s):
    """Parse sentence *s* with a hand-written CFG and summarise every parse.

    The sentence is segmented with ``fool.cut`` (the token list is printed),
    the fixed phrase rules below are extended with the string returned by
    ``product_grammar(s)``, and the tokens are parsed with an NLTK chart
    parser.

    Returns:
        A list with one entry per parse tree. Each entry is a list of
        ``[label, text]`` pairs, one per direct child of the tree root, where
        ``text`` is the concatenation of that child's leaves.
    """
    tokens = fool.cut(s)[0]
    print(tokens)
    # presumably the per-sentence lexical rules; verify against product_grammar
    extra_rules = product_grammar(s)
    grammar = CFG.fromstring("""
    S ->NP V NP U L|NP U NP V L| NP U L V NP|L U NP V NP|L V NP U NP|NP V L U NP
    NP -> N N|r NP|NP A NP|M Q NP|N|NP U NP|A U NP|N NP|NP C NP|NP U|M NP
    VP ->V|V NP|V VP|A VP|VP NP|VP U|VP C VP|VP P|VP uguo
    V -> v|vi|vshi
    N ->n|nr|t|ns|f|nx|nz
    R ->r
    C ->c
    P ->p
    L ->R|R NP
    U ->ude|y
    A ->a|d|ad
    M ->m
    Q ->q
    """ + extra_rules)
    parser = nltk.ChartParser(grammar)
    summaries = []
    for parse in parser.parse(tokens):
        summaries.append(
            [[child.label(), ''.join(child.leaves())] for child in parse]
        )
    return summaries
def create_batches(self, train_file, batch_size, sequence_length):
    """Read ``text<TAB>label`` lines and build fixed-length id batches.

    Every run of word characters in the text is replaced by ``' L'`` before
    segmentation with ``fool.cut``. Tokens that are stop words, missing from
    ``self.token_dictionary`` or rejected by ``chinese.is_other_all`` are
    dropped. Each sequence is truncated / right-padded to *sequence_length*
    with ``self.vocab_size - 1`` as the padding id.

    Sets ``self.x_data``, ``self.y_data``, ``self.num_batches``,
    ``self.x_batches``, ``self.y_batches`` and resets ``self.pointer`` to 0.
    Samples that do not fill a whole batch are discarded.

    :param train_file: path of the tab-separated training file
    :param batch_size: number of samples per batch
    :param sequence_length: length every id sequence is forced to
    """
    self.x_data = []
    self.y_data = []
    padding_index = self.vocab_size - 1

    # Context manager so the handle is closed even if a line is malformed.
    with open(train_file) as train_fh:
        for line in train_fh:
            # Python 2 yields byte strings, Python 3 already yields text.
            if isinstance(line, bytes):
                line = line.decode('utf-8')
            line = line.replace('\n', '')
            text, label = line.strip().split('\t')
            tokens = fool.cut(re.sub(r'\w+', ' L', text))

            seq_ids = []
            for token in tokens[0]:
                if token in self.stop_words:
                    continue
                token_id = self.token_dictionary.get(token)
                if token_id is None or chinese.is_other_all(token):
                    continue
                seq_ids.append(token_id)

            seq_ids = seq_ids[:sequence_length]
            seq_ids.extend([padding_index] * (sequence_length - len(seq_ids)))

            self.x_data.append(seq_ids)
            self.y_data.append(self.label_dictionary.get(label))

    # Keep only as many samples as fit into complete batches.
    self.num_batches = len(self.x_data) // batch_size
    usable = self.num_batches * batch_size
    self.x_data = np.array(self.x_data[:usable], dtype=int)
    self.y_data = np.array(self.y_data[:usable], dtype=int)

    self.x_batches = np.split(self.x_data.reshape(batch_size, -1), self.num_batches, 1)
    self.y_batches = np.split(self.y_data.reshape(batch_size, -1), self.num_batches, 1)
    self.pointer = 0
def word2vec(line):
    """Turn *line* into a bag-of-words vector plus the entities found in it.

    Known person, movie and genre names are replaced by the placeholders
    ``" nnt "``, ``" nm "`` and ``" ng "``; person/company entities reported
    by ``fool.analysis`` are also replaced by ``" nnt "``. The rewritten line
    is segmented and every token present in ``vocab_dict`` sets the matching
    position of the output vector to 1.

    Returns:
        ``(word2id_list, entities)`` where ``word2id_list`` is a 0/1 list of
        length ``len(vocab_dict)`` and ``entities`` maps 0 / 1 / 2 to the last
        matched person / movie / genre name respectively.
    """
    word2id_list = [0] * len(vocab_dict)
    entities = {}

    # (slot in `entities`, known names, placeholder token)
    placeholder_specs = (
        (0, person_names, " nnt "),
        (1, movie_names, " nm "),
        (2, genre_names, " ng "),
    )
    for slot, names, placeholder in placeholder_specs:
        for name in names:
            if name in line:
                line = line.replace(name, placeholder)
                entities[slot] = name

    _, ner = fool.analysis(line)
    for entity in ner[0]:
        if entity[2] in ("person", "company"):
            line = line.replace(entity[3], " nnt ")

    for word in fool.cut(line)[0]:
        try:
            word2id_list[int(vocab_dict[word])] = 1
        except (KeyError, ValueError, TypeError, IndexError):
            # Out-of-vocabulary word or unusable index: skip it, but do not
            # swallow unrelated errors such as KeyboardInterrupt.
            continue
    return word2id_list, entities
def tcut():
    """Print NER, POS-tagging and segmentation results for a sample sentence.

    The user dictionary is deleted before the final ``fool.cut`` call.
    """
    sample = "我在北京天安门"

    _, entities = fool.analysis(sample)
    print(entities)

    tagged = fool.pos_cut(sample)
    print(tagged)

    fool.delete_userdict()
    print(fool.cut(sample))
def get_segmentation(line, print_=False):
    """Segment *line* and return the token list of its first sentence.

    :param line: text to segment; surrounding whitespace is stripped first
    :param print_: when true, also print the tokens joined by commas
    :return: first element of the ``fool.cut`` result
    """
    segmented = fool.cut(line.strip())
    if print_:
        print(','.join(segmented[0]))
    return segmented[0]
class clf_model:
    """Intent classifier: TF-IDF + logistic regression, plus a keyword fallback.

    ``predict_rule`` returns intent 2 for refusal/stop keywords, 1 for
    ordering keywords or short affirmative replies, and 0 otherwise.
    """

    # Short affirmative replies that are answered without consulting the model.
    _AFFIRMATIVE = ["好的", "需要", "是的", "要的", "好", "要", "是"]

    def __init__(self):
        # Placeholders until train() replaces them with fitted objects.
        self.model = ""
        self.vectorizer = ""

    def train(self):
        """Fit the vectorizer and the model on ``data_train.xlsx``.

        The sheet must provide the columns ``sentence_train`` and ``label``.
        """
        d_train = pd.read_excel("data_train.xlsx")
        d_train.sentence_train = d_train.sentence_train.apply(self.fun_clean)
        print("训练样本 = %d" % len(d_train))
        # token_pattern is set explicitly; otherwise sklearn would drop
        # single-character tokens.
        self.vectorizer = TfidfVectorizer(analyzer="word", token_pattern=r"(?u)\b\w+\b")
        features = self.vectorizer.fit_transform(d_train.sentence_train)
        print("训练样本特征表长度为 " + str(features.shape))
        self.model = LogisticRegression(C=10)
        self.model.fit(features, d_train.label)

    def predict_model(self, sentence):
        """Return ``(class_index, probability)`` predicted by the trained model.

        Short affirmative replies bypass the model and yield ``(1, 0.8)``.
        """
        if sentence in self._AFFIRMATIVE:
            return 1, 0.8
        sent_features = self.vectorizer.transform([sentence])
        probabilities = self.model.predict_proba(sent_features).tolist()[0]
        score = max(probabilities)
        return probabilities.index(score), score

    def predict_rule(self, sentence):
        """Return ``(intent, 0.8)`` decided purely by keyword rules."""
        sentence = sentence.replace(' ', '')
        # Refusal keywords are tested first because several of them contain
        # an ordering keyword (e.g. "不买" contains "买").
        if re.search(r'不需要|不要|停止|终止|退出|不买|不定|不订', sentence):
            return 2, 0.8
        if re.search(r'订|定|预定|买|购', sentence) or sentence in self._AFFIRMATIVE:
            return 1, 0.8
        return 0, 0.8

    def fun_clean(self, sentence):
        """Replace recognised entities by their type and drop stop words.

        Returns the cleaned sentence as space-separated tokens.
        """
        words, ners = fool.analysis(sentence)
        # list.sort() returns None, so the original assignment always disabled
        # the replacement below; sorted() returns the ordered list. Longest
        # entities go first so a shorter entity that is a substring of a
        # longer one cannot break it up.
        ners = sorted(ners[0], key=lambda x: len(x[-1]), reverse=True)
        for ner in ners:
            sentence = sentence.replace(ner[-1], ' ' + ner[2] + ' ')
        word_lst = [w for w in fool.cut(sentence)[0] if w not in stopwords]
        output_str = ' '.join(word_lst)
        output_str = re.sub(r'\s+', ' ', output_str)
        return output_str.strip()
def get_segmentation(line, print_=False):
    """Load the keyword dictionary, then segment *line*.

    :param line: text to segment; surrounding whitespace is stripped first
    :param print_: when true, also print the tokens joined by commas
    :return: first element of the ``fool.cut`` result
    """
    # The dictionary is (re)loaded on every call, from a fixed local path.
    load_dict('F:\\114代码\\i\\wordSegment\\kw.txt')
    segmented = fool.cut(line.strip())
    if print_:
        print(','.join(segmented[0]))
    return segmented[0]
def classify(word, dict):
    """Classify *word* by its explanation and store it with related links.

    One document per category is built from the segmented column 3 of the
    ``T_Keywords`` rows whose column 2 equals that category. An SVC is trained
    on the TF-IDF vectors of these documents and predicts the category of the
    segmented explanation returned by ``get_parses(word)``. The keyword is
    then inserted into ``T_Keywords`` and every link returned by
    ``get_policy(word)`` into ``T_Links``.

    :param word: keyword to classify
    :param dict: user dictionary passed to ``fool.load_userdict``
    """
    # NOTE(review): the INSERT statements below are built by string
    # interpolation and are open to SQL injection if `word`, the explanation
    # or the crawled titles/URLs are untrusted. Switch to parameterized
    # queries if the `mysql` helper supports them.
    results = mysql.select("select * from T_Keywords")

    # Loading the dictionary is loop-invariant; do it once, not once per row.
    fool.load_userdict(dict)
    corpus = []
    for category in categories:
        segmented_rows = [
            " ".join(fool.cut(result[3])[0])
            for result in results
            if result[2] == category
        ]
        # Join rows with a space so the last token of one row does not fuse
        # with the first token of the next one.
        corpus.append(" ".join(segmented_rows))

    exp = get_parses(word)  # explanation of the current word
    # Reloaded as in the original, in case get_parses touches the dictionary.
    fool.load_userdict(dict)
    corpus.append(" ".join(fool.cut(exp)[0]))

    csr_mat = CountVectorizer().fit_transform(corpus)
    tfidf = TfidfTransformer().fit_transform(csr_mat)

    y = np.array(categories)
    length = len(categories)
    model = SVC()
    model.fit(tfidf[0:length], y)
    predicted = model.predict(tfidf[length:])

    # Store the newly classified keyword.
    sql = "insert into T_Keywords(keyword,category,weight,explanation) values('%s','%s','%s','%s')" % (
        word, predicted[0], 1, exp)
    kid = mysql.exec(sql)  # presumably the id of the inserted row; verify

    # Crawl related links and store them under that keyword id.
    for href in get_policy(word):
        title = href.get('title')
        url = href.get('url')
        sql = "insert into T_Links(title,href,kid) values('%s','%s','%s')" % (
            title, url, kid)
        mysql.exec(sql)
def processSentence(sentence):
    """Print segmentation, POS tagging and NER results for *sentence*.

    This is a best-effort helper: any failure is reported on stderr instead
    of being raised, so it always returns ``None``.
    """
    import sys

    try:
        print(fool.cut(sentence))
        print(fool.pos_cut(sentence))
        words, ners = fool.analysis(sentence)
        print(words, ners)
    except Exception as err:
        # Still non-fatal, but no longer silent, and KeyboardInterrupt /
        # SystemExit are allowed to propagate.
        sys.stderr.write("processSentence failed for %r: %r\n" % (sentence, err))
def test_seg(self):
    """Segment every entry of ``sentence`` with eight tokenizers and print them.

    Each library is run over the whole list in turn; the results are then
    printed side by side, one group per sentence.
    """
    # Jiagu
    jiagu_result = [jiagu.seg(sen) for sen in sentence]
    # jieba
    jieba_result = [jieba.cut(sen) for sen in sentence]
    # HIT LTP
    pyltp_result = [self.ltpseg.segment(sen) for sen in sentence]
    # HanLP
    pyhanlp_result = [
        [term.word for term in pyhanlp.HanLP.segment(sen)] for sen in sentence
    ]
    # THULAC (Tsinghua)
    thulac_result = [self.thu1.cut(sen, text=True).split() for sen in sentence]
    # NLPIR
    pynlpir_result = [pynlpir.segment(sen, pos_tagging=False) for sen in sentence]
    # SnowNLP
    snownlp_result = [snownlp.SnowNLP(sen).words for sen in sentence]
    # FoolNLTK, called once with the whole list
    fool_result = fool.cut(sentence)

    rows = zip(sentence, jiagu_result, jieba_result, pyltp_result,
               pyhanlp_result, thulac_result, pynlpir_result,
               snownlp_result, fool_result)
    for sen, by_jiagu, by_jieba, by_ltp, by_hanlp, by_thulac, by_nlpir, by_snow, by_fool in rows:
        print('句子:\t\t' + sen + '\n')
        print('结巴:\t\t' + ' '.join(by_jieba))
        print('HanLP:\t\t' + ' '.join(by_hanlp))
        print('SnowNLP\t\t' + ' '.join(by_snow))
        print('FoolNLTK\t' + ' '.join(by_fool))
        print('甲骨:\t\t' + ' '.join(by_jiagu))
        print('哈工大:\t' + ' '.join(by_ltp))
        print('清华:\t\t' + ' '.join(by_thulac))
        print('NLPIR:\t\t' + ' '.join(by_nlpir))
        print('\n')
def cutNewsTitleByFool(fromfilename, tofilename):
    """Segment every line of *fromfilename* and write the result to *tofilename*.

    Both files are read/written as UTF-8. Any exception is printed rather
    than raised, so the function always returns ``None``.

    :param fromfilename: input file, one title per line
    :param tofilename: output file; created or truncated
    """
    try:
        # Context managers close both files even when segmentation fails
        # part-way through.
        with open(fromfilename, 'r', encoding='utf8') as ffile, \
                open(tofilename, 'w', encoding='utf8') as tfile:
            for title in ffile:
                # NOTE(review): this join assumes fool.cut returns a flat list
                # of tokens; if the installed FoolNLTK returns one token list
                # per sentence it needs fool.cut(title)[0] - confirm.
                tfile.write(' '.join(fool.cut(title)))
    except Exception as e:
        print(e)
def transform_raw(self, text, sequence_length):
    """Convert raw *text* into a fixed-length list of token ids.

    Every run of word characters is replaced by ``' L'`` before segmentation
    with ``fool.cut``. Tokens rejected by ``chinese.is_other_all`` are
    ignored; the remaining ones are stored in ``self.words``. Of those, stop
    words and tokens missing from ``self.token_dictionary`` are dropped, and
    the id list is truncated / right-padded to *sequence_length* with
    ``self.vocab_size - 1``.

    :param text: input text; byte strings are decoded as UTF-8
    :param sequence_length: length of the returned id list
    :return: list of token ids
    """
    # `unicode` only exists on Python 2. On Python 2 `bytes` is `str`, so this
    # test selects the same inputs there and also works on Python 3.
    if isinstance(text, bytes):
        text = text.decode('utf-8')

    tokens = fool.cut(re.sub(r'\w+', ' L', text))[0]
    content_tokens = [token for token in tokens if not chinese.is_other_all(token)]

    x = []
    for token in content_tokens:
        if token in self.stop_words:
            continue
        token_id = self.token_dictionary.get(token)
        # Out-of-vocabulary tokens are skipped instead of being emitted as
        # None, which no downstream integer array could hold.
        if token_id is not None:
            x.append(token_id)

    x = x[:sequence_length]
    padding_index = self.vocab_size - 1
    x.extend([padding_index] * (sequence_length - len(x)))

    self.words = content_tokens
    return x
def before_data_clean():
    """Segment the Weibo comments and export the tokens and the top-30 words.

    Reads ``Result.xlsx``, joins column ``0`` with commas, segments the text
    with ``fool.cut``, writes every token to ``instance.xls`` and the 30 most
    frequent tokens with their counts to ``enci.xls``.
    """
    comment_data = pd.read_excel('F:/learning/weibo/Result.xlsx')
    print(comment_data)

    text = str(",".join(comment_data[0]))
    print(text)

    a = fool.cut(text)
    print(a)
    tokens = a[0]

    pd.DataFrame(tokens, columns=["instance"]).to_excel(
        'F:/learning/weibo/instance.xls', encoding='utf_8_sig')

    # Computed once; the original also made a call whose result was discarded.
    top_words = Counter(tokens).most_common(30)
    pd.DataFrame(top_words).to_excel(
        'F:/learning/weibo/enci.xls', encoding='utf_8_sig')
def getWordStatsWithFool(data):
    """Count how often each word occurs in the barrage comments.

    :param data: sequence of rows; ``data[n][0]`` is the barrage text
    :return: dict mapping each segmented word to its number of occurrences
    """
    wordFrequency = {}
    for row in data:
        for word in fool.cut(row[0])[0]:
            wordFrequency[word] = wordFrequency.get(word, 0) + 1
    return wordFrequency
def segmentation_conversion_helper(fn,
                                   list_line,
                                   sub_folder,
                                   phrase_syllable="phrase"):
    """Write word-segmented and pinyin versions of the lines in *list_line*.

    Each element of *list_line* is indexed as ``line[0]``, ``line[1]`` and
    ``line[2]``; lines whose third field is empty once spaces are removed are
    skipped. The two outputs go to
    ``<mandarin_kugou_root>/<sub_folder>/<fn>_<phrase_syllable>_char.txt`` and
    ``..._pinyin.txt``.
    """

    def _has_text(line):
        return len(line[2].replace(" ", "")) > 0

    list_line_char = [
        [line[0], line[1], ' '.join(fool.cut(line[2])[0])]
        for line in list_line if _has_text(line)
    ]
    list_line_pinyin = [
        [line[0], line[1], pinyin.get(line[2], format='strip', delimiter=' ')]
        for line in list_line if _has_text(line)
    ]

    stem = fn + '_' + phrase_syllable
    write_line(filename=os.path.join(mandarin_kugou_root, sub_folder,
                                     stem + '_char.txt'),
               list_line=list_line_char)
    write_line(filename=os.path.join(mandarin_kugou_root, sub_folder,
                                     stem + '_pinyin.txt'),
               list_line=list_line_pinyin)
def fun_clean(self, sentence):
    """Pre-process a user utterance.

    :param sentence: raw user input
    :return: the cleaned sentence as space-separated tokens
    """
    # Named-entity recognition with FoolNLTK.
    words, ners = fool.analysis(sentence)
    # Longest entities first, so a shorter entity that is a substring of a
    # longer one cannot break it up. list.sort() returns None (which made the
    # original skip the replacement entirely); sorted() returns the list.
    ners = sorted(ners[0], key=lambda x: len(x[-1]), reverse=True)
    # Replace each entity string by its entity type so that all entities of
    # one type act as a single shared feature.
    for ner in ners:
        sentence = sentence.replace(ner[-1], ' ' + ner[2] + ' ')
    # Segment and remove stop words.
    word_lst = [w for w in fool.cut(sentence)[0] if w not in stopwords]
    output_str = ' '.join(word_lst)
    output_str = re.sub(r'\s+', ' ', output_str)
    return output_str.strip()
def draw_1(s):
    """Parse sentence *s* with a hand-written CFG and print every parse tree.

    The sentence is segmented with ``fool.cut`` (the token list is printed),
    the fixed phrase rules below are extended with the string returned by
    ``product_grammar(s)``, and the tokens are parsed with an NLTK chart
    parser.
    """
    tokens = fool.cut(s)[0]
    print(tokens)
    # presumably the per-sentence lexical rules; verify against product_grammar
    extra_rules = product_grammar(s)
    grammar = CFG.fromstring("""
    S -> NP L NP|NP vshi NP y|NP L P NP|NP L P NP F|NP vshi R|T vshi R
    NP -> nr nr| nr ude n| nr n|NP ude NP|NP NP|z ude n|a ude n|v ude n|nr|n|b ude|ns ude|ns|ns ude NP|m n|m q n|A\
    |d m|m|NP c NP|NP p NP
    VP -> v NP|v VP
    L ->vshi d vshi
    P ->p|vi p
    F ->f
    T ->t
    R ->r|r NP|r ude NP
    A ->a|d a|m q|d a ude
    """ + extra_rules)
    parser = nltk.ChartParser(grammar)
    for parse in parser.parse(tokens):
        print(parse)
def create_dictionary(self, train_file, save_dir):
    """Build the token and label dictionaries from a raw training file.

    Each line must be ``text<TAB>label``. Every run of word characters in the
    text is replaced by ``' L'`` before segmentation with ``fool.cut``. Tokens
    rejected by ``chinese.is_other_all`` and stop words are not indexed. The
    marker ``'</s>'`` is stored last, so its id is ``vocab_size - 1``.

    Nothing is returned. The results are stored in ``self.token_dictionary``,
    ``self.label_dictionary``, ``self.labels``, ``self.vocab_size`` and
    ``self.n_classes``, and both dictionaries are pickled to
    ``save_dir + 'dictionary'``.

    :param train_file: path of the raw training file
    :param save_dir: prefix of the pickle path (no separator is inserted)
    """
    token_dictionary = {}
    label_dictionary = {}
    labels = []

    # Context manager so the handle is closed even if a line is malformed.
    with open(train_file) as train_fh:
        for line in train_fh:
            # Python 2 yields byte strings, Python 3 already yields text.
            if isinstance(line, bytes):
                line = line.decode('utf-8')
            line = line.replace('\n', '')
            text, label = line.strip().split('\t')
            tokens = fool.cut(re.sub(r'\w+', ' L', text))

            if label not in label_dictionary:
                # Ids are assigned in order of first appearance.
                label_dictionary[label] = len(label_dictionary)
                labels.append(label)

            for token in tokens[0]:
                if (token not in token_dictionary
                        and not chinese.is_other_all(token)
                        and token not in self.stop_words):
                    token_dictionary[token] = len(token_dictionary)

    token_dictionary['</s>'] = len(token_dictionary)

    self.vocab_size = len(token_dictionary)
    self.n_classes = len(label_dictionary)
    print('Corpus Vocabulary:{0}, Classes:{1}'.format(self.vocab_size, self.n_classes))

    # pickle produces bytes, so the file must be opened in binary mode.
    with open(save_dir + 'dictionary', 'wb') as f:
        pickle.dump((token_dictionary, label_dictionary), f)

    self.token_dictionary = token_dictionary
    self.label_dictionary = label_dictionary
    self.labels = labels
def fun_clean(self, sentence):
    """Pre-process a user utterance for intent classification.

    Recognised entities are replaced by their entity type (so that all
    entities of one type act as one feature) and stop words are removed.

    :param sentence: raw user input
    :return: the cleaned sentence as space-separated tokens
    """
    words, ners = fool.analysis(sentence)
    # list.sort() returns None, so the original always skipped the loop below;
    # sorted() returns the ordered list. Longest entities go first so a
    # shorter entity that is a substring of a longer one cannot break it up.
    ners = sorted(ners[0], key=lambda x: len(x[-1]), reverse=True)
    for ner in ners:
        # str.replace returns a new string; the result must be assigned back.
        sentence = sentence.replace(ner[-1], ' ' + ner[2] + ' ')
    wordslist = [word for word in fool.cut(sentence)[0] if word not in stopwords]
    sentence = ' '.join(wordslist)
    sentence = re.sub(r'\s+', ' ', sentence).strip()
    return sentence
def predict_model(self, sentence):
    """Predict the intent of *sentence* with the trained classifier.

    :param sentence: raw user input
    :return: ``(clf_result, score)`` - the index of the most probable class
        and its probability
    """
    # Short replies that do not occur in the training samples are handled
    # explicitly.
    if sentence in ["好的", "需要", "是的", "要的", "好", "要", "是"]:
        return 1, 0.8

    segmented = ' '.join(fool.cut(sentence)[0])
    cleaned = self.fun_clean(segmented)
    features = self.vectorizer.transform([cleaned])
    scores = self.model.predict_proba(features)[0]
    clf_result = np.argmax(scores, axis=0)
    return clf_result, scores[clf_result]
def segment_lyric_convert_pinyin_mir1k():
    """Write segmented and pinyin annotations for every MIR-1K lyric file.

    For each file name found under ``<mir1k_root>/Lyrics`` the first lyric
    line is converted with OpenCC (``tw2s``) and written twice under
    ``<mir1k_root>/annotation``: as pinyin (``_phrase_pinyin.txt``) and as
    space-separated words (``_phrase_char.txt``). Files that raise
    ``UnicodeDecodeError`` are reported by printing their name.
    """
    converter = OpenCC('tw2s')
    lyrics_dir = os.path.join(mir1k_root, 'Lyrics')
    # set() removes duplicate names before processing
    names = list(set(get_filenames_in_folder(lyrics_dir)))

    for fn in names:
        lyric_path = os.path.join(lyrics_dir, fn + '.txt')
        try:
            lines = read_mir1k_lyrics(lyric_path)
            simplified = converter.convert(lines[0])
            as_pinyin = pinyin.get(simplified, format='strip', delimiter=' ')
            as_words = ' '.join(fool.cut(simplified)[0])

            char_path = os.path.join(mir1k_root, 'annotation',
                                     fn + '_phrase_char.txt')
            write_lyrics_one_line(filename=char_path, line=as_words)

            pinyin_path = os.path.join(mir1k_root, 'annotation',
                                       fn + '_phrase_pinyin.txt')
            write_lyrics_one_line(filename=pinyin_path, line=as_pinyin)
        except UnicodeDecodeError:
            print(fn)
import fool
import xlrd
import xlwt

# Segment the first cell of every row of Sheet1 in zwcg.xls and write the
# resulting tokens, one per column, to the same row of result.xls.
workbook = xlwt.Workbook(encoding='ascii')
worksheet = workbook.add_sheet('My Worksheet')

data = xlrd.open_workbook('zwcg.xls')
table = data.sheet_by_name(u'Sheet1')
nrows = table.nrows

for i in range(nrows):
    text = table.row_values(i)
    worklist = fool.cut(text[0])
    for j, token in enumerate(worklist):
        worksheet.write(i, j, label=token)

workbook.save('result.xls')
""" https://github.com/rockyzhengwu/FoolNLTKhttps://github.com/rockyzhengwu/FoolNLTK """ import q from pyhanlp import HanLP, JClass with open("../test_data/1.txt", "r") as rf: text = rf.read() text = text[:502] import fool result = fool.cut(text) print(" ".join(result[0])) import fool words, ners = fool.analysis(text) print(ners) """ [[ (0, 5, 'company', '新浪科技'), (6, 9, 'location', '北京'), (10, 18, 'time', '4月29日晚间'), (20, 25, 'company', '搜狗公司'), (24, 27, 'time', '今天'), (31, 37, 'time', '3月31日'), (37, 47, 'time', '2019年第一季度'), (60, 65, 'time', '第一季度'),
def foolnltk(self, text):
    """Segment *text* with FoolNLTK and return the ``fool.cut`` result as is."""
    return fool.cut(text)
'''
Source tutorial:
https://github.com/rockyzhengwu/FoolNLTK/blob/master/README_CH.md
'''
import fool

# The txt file holds the user's local custom dictionary; each line has the
# format: word weight
path = r"C:\Users\lenvov\Desktop\my_diy_dic.txt"
fool.load_userdict(path)  # load the custom dictionary
# The dictionary can only define word weights, not parts of speech, so it does
# not help POS tagging.
# Call fool.delete_userdict() to remove the custom dictionary again.

text = "习近平觉得张构架的趣多多比希斯罗机场的巧克力味的奥利奥要贵得多。"

# `words` holds the segmented, POS-tagged result (built-in dictionary only, no
# custom dictionary); `ners` holds the recognised entities. Segmentation can
# be inaccurate while NER is still correct; using a custom dictionary fixes it.
# The `words` list produced during NER is not affected by the custom
# dictionary and is generally not used.
words, ners = fool.analysis(text)

segmented = fool.cut(text)
print('文本切分:', segmented, '\n')
tagged = fool.pos_cut(text)
print('文本切分后进行词性标注:', tagged, '\n')
print('words:', words, '\n')
print('实体识别', ners, '\n')
#!/usr/bin/env python
# -*-coding:utf-8-*-
"""Show fool.cut output before loading, with, and after deleting a user dict."""

import fool

text = "我在北京天安门看你难受香菇,一一千四百二十九"

without_dict = fool.cut(text)
print("no dict:", without_dict)

fool.load_userdict("./test_dict.txt")
with_dict = fool.cut(text)
print("use dict: ", with_dict)

fool.delete_userdict()
after_delete = fool.cut(text)
print("delete dict:", after_delete)

words, ners = fool.analysis(text)
print("ners: ", ners)
#!/usr/bin/env python
# -*-coding:utf-8-*-
"""Run cut / pos_cut / analysis / ner on a list of sentences."""

import fool

text = [
    "我在北京天安门看你难受香菇,一一千四百二十九",
    "我在北京晒太阳你在非洲看雪",
    "千年不变的是什么",
    "我在北京天安门。",
]

without_dict = fool.cut(text, ignore=True)
print("no dict:", without_dict)

fool.load_userdict("./test_dict.txt")
with_dict = fool.cut(text)
print("use dict: ", with_dict)

fool.delete_userdict()
after_delete = fool.cut(text)
print("delete dict:", after_delete)

pos_words = fool.pos_cut(text)
print("pos result", pos_words)

words, ners = fool.analysis(text)
print("ners: ", ners)

ners = fool.ner(text)
print("ners:", ners)
metavar="DELIM", nargs='?', const='_', help= "enable POS tagging; if DELIM is specified, use DELIM instead of '_' for POS delimiter" ) parser.add_argument("-D", "--dict", help="use DICT as dictionary") parser.add_argument( "-u", "--user-dict", help= "use USER_DICT together with the default dictionary or DICT (if specified)" ) parser.add_argument("filename", nargs='?', help="input file") args = parser.parse_args() delim = args.delimiter fp = open(args.filename, 'r') if args.filename else sys.stdin ln = fp.readline() while ln: l = ln.rstrip('\r\n') result = delim.join(fool.cut(ln.rstrip('\r\n'))) print(result) ln = fp.readline() fp.close()
import fool

text = "2017年12月29日,上海嘉定公安机关接到报警电话,市民称其接到一家自称为某装饰公司员工的电话,对方在向其推销房屋装潢工程时能准确说出其姓名、手机号、房产地址等个人信息,该市民感觉个人信息被侵犯,于是报警。"

segmented = fool.cut(text)
# Example of the output format only (it is for a different input sentence):
# ['一个', '傻子', '在', '北京']
print(segmented)
def text_to_vec(self, text):
    """Segment *text* and map each token through ``self.to_num``.

    :return: list with one ``self.to_num`` result per token, in token order
    """
    return [self.to_num(word) for word in fool.cut(text)[0]]
def fool_cut(text):
    """Return *text* segmented by ``fool.cut`` as one space-separated string."""
    tokens = fool.cut(text)[0]
    return " ".join(tokens)
parser.add_argument("filename", nargs='?', help="input file") args = parser.parse_args() delim = args.delimiter plim = args.pos batch_zize = args.batch_size if args.user_dict: fool.load_userdict(args.user_dict) fp = open(args.filename, 'r') if args.filename else sys.stdin lines = fp.readlines(batch_zize) while lines: lines = [ln.strip("\r\n") for ln in lines] if args.pos: result_list = fool.pos_cut(lines) for res in result_list: out_str = [plim.join(p) for p in res] print(delim.join(out_str)) else: result_list = fool.cut(lines) for res in result_list: print(delim.join(res)) lines = fp.readlines(batch_zize) fp.close()
#!/usr/bin/env python
# -*-coding:utf-8-*-
"""Run cut / pos_cut / analysis / ner on a list of sentences."""

import fool

text = [
    "我在北京天安门看你难受香菇,一一千四百二十九",
    "我在北京晒太阳你在非洲看雪",
    "千年不变的是什么",
    "我在北京天安门。",
]

without_dict = fool.cut(text, ignore=True)
print("no dict:", without_dict)

fool.load_userdict("./test_dict.txt")
with_dict = fool.cut(text)
print("use dict: ", with_dict)

fool.delete_userdict()
after_delete = fool.cut(text)
print("delete dict:", after_delete)

pos_words = fool.pos_cut(text)
print("pos result", pos_words)

words, ners = fool.analysis(text)
print("ners: ", ners)

ners = fool.ner(text)
print("ners:", ners)
All_Dict['main2_' + str(i)] = list() All_Dict['year_' + str(i)] = list() user_dict = "./_reference/thulac/THUOCL_it_space.txt" fool.load_userdict(user_dict) count = 0 unstructured = list() with open(raw_cn, 'r', encoding='UTF-8') as raw: for line in raw: temp = line.split('\t') if len(temp) == 3: current_year = str(temp[2].strip()) All_Dict['title_' + current_year].append(temp[0]) All_Dict['year_' + current_year].append(current_year) #All_Dict['main1_' + current_year].extend(fool.cut(temp[1])) All_Dict['main2_' + current_year].extend(fool.cut(temp[1])) else: unstructured.append(count) count += 1 if count % 1000 == 0: print("Time for 1000 sentences: %.2f" % (time.time() - TempTime)) TempTime = time.time() print("unstructured sample:") print(unstructured) for i in range(1998, 2018): with open('./structured_data/fool/com_cn_title_' + str(i) + '.txt', 'w', encoding='UTF-8') as f: