#import stanfordnlp
import pandas as pd
import numpy as np
from hanlp_parse import han_analyzer
from sentence_structure_utils import base_structure
from knowledge_bank_utils import get_intent_classes
from input_process_util import Processor

#%%
init_stop_words_path = '../libs/init_stop_words.txt'
data_path = '../../data/results/filtered_test0424.xlsx'
out_data_path = '../../data/results/filtered_test_nlu_0424.xlsx'
#results_path = '../../data/results/.xlsx'
df = pd.read_excel(data_path, sheet_name='wenti')
qs = df['问题']  # '问题' = "question" column

#%%
processor = Processor(init_stop_words_path=init_stop_words_path)
analyzer = han_analyzer()

#%%
## step one: strip initial stop words from each question
df['ini_remove'] = df['问题'].apply(processor.check_and_remove_ini, args=(analyzer, False))

#%%
intents = [get_intent_classes(i) for i in qs]

#%%
df_intents = pd.DataFrame(intents)

#%%
df = df.merge(df_intents, left_index=True, right_index=True)

#%%
df.to_excel(out_data_path)
import pandas as pd
import numpy as np
from hanlp_parse import han_analyzer
from sentence_structure_utils import base_structure
from input_process_util import Processor

## set up global variables
data_path = './data/dependency_tree_test_data.xlsx'
results_path = './data/dep_tree_out_1.xlsx'
keep_columns = ['ask']
df = pd.read_excel(data_path, sheet_name='a_b_1')
df = df[keep_columns]
df.dropna(inplace=True)
df.reset_index(inplace=True)
#df = df.head(1000)
input_column_name = 'ask'
#intent_column_name = '意图'

#%%
## use the HanLP parser
print('parsing using han analyzer....')
analyzer = han_analyzer()
processor = Processor('../libs/init_stop_words.txt')
input_data = df[input_column_name].values

#%%
test_data = [processor.remove_init_stop_words(i) for i in input_data]
assert len(test_data) == len(input_data)
df['filtered_input'] = np.array(test_data)

#%%
msg_list = [
    base_structure(s, analyzer).print_dep_tree(print_out=False)
    for s in test_data
]
msg_list = ['\n'.join(m) for m in msg_list]

#%%
## get_dep_output_han is not defined in this fragment; see the sketch after this cell
df['han_dep'] = df[input_column_name].apply(get_dep_output_han, args=(analyzer, ))
df['han_dep_tree'] = np.array(msg_list)
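
#%%
## The cell above applies a module-level get_dep_output_han that this fragment
## never defines. A minimal sketch, assuming the same behavior as the method of
## the same name on NLU_match/RB2 below (sentence first, analyzer via args):
def get_dep_output_han(sentence, analyzer):
    """Return (lemma, POS tag, dependency relation, head lemma) tuples for a sentence."""
    try:
        word_dict, word_objs = analyzer.dep_parse(sentence, False)
        res = [(w['LEMMA'], w['POSTAG'], w['DEPREL'], w['HEAD_LEMMA'])
               for w in word_dict]
    except Exception:
        print(sentence)
        res = None
    return res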
#%%
if __name__ == '__main__':
    data_path = '../../data/raw/intent_data_clean.csv'
    results_path = '../../data/results/initial_stop_words_analysis.xlsx'
    keep_columns = ['id', '用户问句', '功能', '意图']
    df = pd.read_csv(data_path, encoding='utf8')
    df = df[keep_columns]
    df.dropna(inplace=True)
    df.reset_index(inplace=True)
    #df = df.head(1000)
    input_column_name = '用户问句'
    intent_column_name = '意图'
    processor = Processor(init_stop_words_path='../libs/init_stop_words.txt')
    analyzer = han_analyzer()

    #%%
    input_data = df[input_column_name].values
    test_data = [i for i in input_data if processor._check_candidate(i)]

    #%%
    ## test sentence
    rule_map = {
        1: (rule_1, True),
        2: (rule_2, True),
        3: (rule_3, True),
        4: (rule_4, False)
    }
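
    #%%
    ## A minimal dispatch sketch over rule_map. Hypothetical assumptions: each
    ## rule_N takes (sentence, analyzer) and returns True on a match, and the
    ## boolean in each pair marks whether the rule is enabled; neither is
    ## confirmed by this fragment.
    def first_matched_rule(sentence, analyzer, rule_map):
        for rule_id, (rule_fn, enabled) in sorted(rule_map.items()):
            if enabled and rule_fn(sentence, analyzer):
                return rule_id
        return None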
import pandas as pd
import numpy as np
from hanlp_parse import han_analyzer
from sentence_structure_utils import base_structure
from input_process_util import Processor

#%%
def get_matched_rules(sentence, processor, analyzer):
    ## map the result of check_all_rules to a rule name via processor.rule2name
    check = processor.check_all_rules(sentence, analyzer)
    return processor.rule2name[check]

#%%
if __name__ == '__main__':
    data_path = '../../data/raw/sentence_with_prefix'
    results_path = '../../data/results/initial_stop_words_analysis_boyan.xlsx'

    #%%
    df = pd.read_csv(data_path, header=None, names=['sentence'])
    df.dropna(inplace=True)
    df.reset_index(inplace=True)

    #%%
    input_column_name = 'sentence'
    processor = Processor(init_stop_words_path='../libs/init_stop_words.txt')
    analyzer = han_analyzer()

    #%%
    df['remove_candidate'] = df[input_column_name].apply(processor._check_candidate)
    df['matched_rules'] = df[input_column_name].apply(get_matched_rules, args=(processor, analyzer))
    df['results'] = df[input_column_name].apply(processor.check_and_remove_ini, args=(analyzer, False))
    df.to_excel(results_path)
import re

from hanlp_parse import han_analyzer
from sentence_structure_utils import base_structure
from input_process_util import Processor


class Long_sentence_processor(object):
    """
    Wraps a HanLP analyzer for dependency parsing and related operations:
    decides whether a long sentence should be split into clauses and,
    if so, splits it.
    """
    def __init__(self, init_stop_words_path='init_stop_words.txt'):
        self.analyzer = han_analyzer()
        self.ini_processor = Processor(init_stop_words_path)

    @staticmethod
    def rule1(node):
        ## a non-root verb, or a non-root coordination ("并列关系") node
        if node['level'] > 1 and node['postag'][0].lower() == "v":
            return True
        elif node['level'] > 1 and node['node'].DEPREL == "并列关系":
            return True
        else:
            return False

    @staticmethod
    def rule2(node):
        ## any verb node
        return node['postag'][0].lower() == "v"

    def check_candidates(self, sentence, verbose=True):
        ## simple normalization; '如果' ("if") in the first clause disqualifies a sentence
        nono_list = ['如果']
        sentence = re.sub(r'\s+', ' ', sentence).strip()
        sentence = re.sub(r',\s+', ',', sentence)
        sentence = re.sub(r'，\s+', '，', sentence)
        sentence = self.ini_processor.check_and_remove_ini(sentence, self.analyzer, verbose=False)
        ## check for a comma (ASCII or full-width) or a space
        if not any(w in sentence for w in ['，', ' ', ',']):
            if verbose:
                print('no , or space found in the sentence')
            return False
        ## check sentence length
        if len(sentence) < 10:
            if verbose:
                print('sentence length < 10')
            return False
        ## check each component's length
        eles = re.split(r'，| |,', sentence)
        if not all(len(e) > 4 for e in eles):
            if verbose:
                print('some components are <= 4 characters')
            return False
        if any(w in eles[0] for w in nono_list):
            return False
        return True

    def check_dependency_rules(self, sentence, verbose=True):
        ## simple normalization
        sentence = re.sub(r'\s+', ' ', sentence).strip()
        sentence = self.ini_processor.check_and_remove_ini(sentence, self.analyzer, verbose=False)
        ## the whole sentence must contain a complex dependency structure ...
        ob = base_structure(sentence, self.analyzer)
        if verbose:
            ob.print_dep_tree()
        r1_res = ob.loop_nodes(ob.dep_tree, self.rule1)
        if len(r1_res) == 0:
            return False
        ## ... and every component must contain a verb
        eles = re.split(r'，| |,', sentence)
        for e in eles:
            ob_e = base_structure(e, self.analyzer)
            if verbose:
                ob_e.print_dep_tree()
            r2_res = ob_e.loop_nodes(ob_e.dep_tree, self.rule2)
            if len(r2_res) == 0:
                return False
        return True

    def check_and_split(self, sentence, verbose=False):
        ## split only if both the surface and the dependency checks pass
        if all([self.check_candidates(sentence, verbose=verbose),
                self.check_dependency_rules(sentence, verbose=verbose)]):
            sentence = re.sub(r'\s+', ' ', sentence).strip()
            return re.split(r'，| |,', sentence)
        return sentence
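
## Usage sketch for Long_sentence_processor; the stop-word path and the sample
## sentence are hypothetical, and a working local HanLP setup is assumed:
if __name__ == '__main__':
    lsp = Long_sentence_processor(init_stop_words_path='../libs/init_stop_words.txt')
    res = lsp.check_and_split('帮我查一下余额，然后把钱转给张三', verbose=True)
    if isinstance(res, list):
        print('split into clauses:', res)   # both checks passed
    else:
        print('kept whole:', res)           # not a split candidate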
from hanlp_parse import han_analyzer
from sentence_structure_utils import base_structure
from knowledge_bank_utils import get_intent_classes
from input_process_util import Processor
## read_sets, read_pattern, convert2record_list and match_patterns are used
## below; their defining module is not shown in this fragment.


class NLU_match(object):
    """
    Pattern-matching NLU front end: wraps a HanLP analyzer for dependency
    parsing and related operations, and matches parsed input against the
    knowledge-bank ask patterns.
    """
    def __init__(self, kb_path, init_stop_words_path):
        self.set_dict = read_sets(kb_path, 'sets')
        self.place_holder_dict = read_sets(kb_path, 'place_holder')
        self.id_pattern_pairs = read_pattern(kb_path, 'ask_pattern', 'intent_id', 'pattern')
        self.record_list = [
            convert2record_list(idpp, self.set_dict, self.place_holder_dict)
            for idpp in self.id_pattern_pairs
        ]
        self.analyzer = han_analyzer()
        self.processor = Processor(init_stop_words_path=init_stop_words_path)
        self.base_structure = base_structure
        self.match_patterns = match_patterns
        self.get_intent_classes = get_intent_classes

    def get_dep_output_han(self, sentence):
        try:
            word_dict, word_objs = self.analyzer.dep_parse(sentence, False)
            res = [(w['LEMMA'], w['POSTAG'], w['DEPREL'], w['HEAD_LEMMA'])
                   for w in word_dict]
        except Exception:
            print(sentence)
            res = None
        return res

    @staticmethod
    def find_levels(node):
        ## keep nodes within two levels of the dependency-tree root
        return node['level'] < 3

    @staticmethod
    def find_levels2(node):
        ## keep nodes within four levels of the dependency-tree root
        return node['level'] < 5

    def match(self, sentence, deep_match=False, match_intent=False):
        sentence = self.processor.check_and_remove_ini(sentence, self.analyzer, False)
        res = self.base_structure(sentence, self.analyzer)
        eles = [
            i['lemma'] for i in res.loop_nodes(res.dep_tree, self.find_levels)
        ]
        eles2 = [
            i['lemma'] for i in res.loop_nodes(res.dep_tree, self.find_levels2)
        ]
        if len(eles) < 1:
            print('log: your input sentence is empty')
            return None
        intent_classes = None
        if match_intent:
            intent_classes = self.get_intent_classes(sentence)
        ## first pass: shallow (level < 3) lemmas against all patterns
        ans = self.match_patterns(eles, self.record_list, 0.4, 0.7,
                                  match_intent, intent_classes)
        ## optional second pass: re-rank survivors with deeper (level < 5) lemmas
        if ans and deep_match and len(eles2) > len(eles):
            print('log: level > 2 info used for match')
            ans = self.match_patterns(eles2, ans, 0.3, 0.6)
        return ans
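
## Usage sketch for NLU_match; the knowledge-bank path, stop-word path and
## sample sentence are hypothetical:
if __name__ == '__main__':
    matcher = NLU_match(kb_path='../../data/kb/knowledge_bank.xlsx',
                        init_stop_words_path='../libs/init_stop_words.txt')
    ## deep_match re-ranks with level < 5 lemmas when the first pass succeeds
    print(matcher.match('帮我查一下余额', deep_match=True, match_intent=False))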
import logging

import jieba

from hanlp_parse import han_analyzer
from sentence_structure_utils import base_structure
from knowledge_bank_utils import get_intent_classes
from input_process_util import Processor
## read_sets, read_pattern, convert2record_list, read_keywords, match_patterns
## and Long_sentence_processor are used below; their defining modules are not
## shown in this fragment.


class RB2(object):
    """
    Rule base 2 functionality: wraps a HanLP analyzer for dependency parsing
    and related operations, adding keyword-weighted pattern matching and
    long-sentence splitting.
    """
    def __init__(self, kb_path, init_stop_words_path, keywords_path,
                 set_path=None, custom_dict=None):
        if custom_dict is not None:
            jieba.load_userdict(custom_dict)
        if set_path is None:
            self.set_dict = read_sets(kb_path, 'sets')
        else:
            self.set_dict = read_sets(set_path)
        self.place_holder_dict = read_sets(kb_path, 'place_holder')
        self.id_pattern_pairs = read_pattern(kb_path, 'ask_pattern', 'intent_id', 'pattern')
        self.record_list = [
            convert2record_list(idpp, self.set_dict, self.place_holder_dict)
            for idpp in self.id_pattern_pairs
        ]
        self.keyword_dict = read_keywords(keywords_path)
        self.analyzer = han_analyzer()
        self.processor = Processor(init_stop_words_path=init_stop_words_path)
        self.base_structure = base_structure
        self.match_patterns = match_patterns
        self.get_intent_classes = get_intent_classes
        self.long_sentence_processor = Long_sentence_processor(init_stop_words_path)

    def get_dep_output_han(self, sentence):
        try:
            word_dict, word_objs = self.analyzer.dep_parse(sentence, False)
            res = [(w['LEMMA'], w['POSTAG'], w['DEPREL'], w['HEAD_LEMMA'])
                   for w in word_dict]
        except Exception:
            print(sentence)
            res = None
        return res

    @staticmethod
    def find_levels(node):
        ## keep nodes within two levels of the dependency-tree root
        return node['level'] < 3

    @staticmethod
    def find_levels2(node):
        ## keep nodes within four levels of the dependency-tree root
        return node['level'] < 5

    @staticmethod
    def filter_and_rank(res_obj, find_level_func):
        ## collect matching nodes and restore original word order by node id
        eles = [(i['id'], i['lemma'])
                for i in res_obj.loop_nodes(res_obj.dep_tree, find_level_func)]
        eles = sorted(eles, key=lambda x: x[0])
        return [x[1] for x in eles]

    def match_one(self, sentence, deep_match=False, match_intent=False, topn=5):
        sentence = self.processor.check_and_remove_ini(sentence, self.analyzer, False)
        res = self.base_structure(sentence, self.analyzer)
        eles = self.filter_and_rank(res, self.find_levels)
        eles2 = self.filter_and_rank(res, self.find_levels2)
        if len(eles) < 1:
            print('log: your input sentence is empty')
            return None
        intent_classes = None
        if match_intent:
            intent_classes = self.get_intent_classes(sentence)
        ## first pass: shallow (level < 3) lemmas against all patterns
        ans = self.match_patterns(eles, self.record_list, self.keyword_dict,
                                  0.4, 0.7, match_intent, intent_classes)
        ## optional second pass: re-rank survivors with deeper (level < 5) lemmas
        if ans and deep_match and len(eles2) > len(eles):
            print('log: level > 2 info used for match')
            ans = self.match_patterns(eles2, ans, self.keyword_dict, 0.3, 0.6)
        return ans[:topn]

    def match(self, sentence, deep_match=False, match_intent=False, topn=5,
              check_long_sentence=True):
        sentence = sentence.lower()
        if check_long_sentence:
            processed_inputs = self.long_sentence_processor.check_and_split(sentence)
            logging.info(processed_inputs)
            if isinstance(processed_inputs, list):
                ## the sentence was split into clauses; match the first two separately
                res = {
                    'answer1': self.match_one(processed_inputs[0],
                                              deep_match=deep_match,
                                              match_intent=match_intent,
                                              topn=topn),
                    'answer2': self.match_one(processed_inputs[1],
                                              deep_match=deep_match,
                                              match_intent=match_intent,
                                              topn=topn)
                }
            else:
                res = {
                    'answer1': self.match_one(processed_inputs,
                                              deep_match=deep_match,
                                              match_intent=match_intent,
                                              topn=topn),
                    'answer2': None
                }
        else:
            res = {
                'answer1': self.match_one(sentence,
                                          deep_match=deep_match,
                                          match_intent=match_intent,
                                          topn=topn),
                'answer2': None
            }
        return res

    def evaluate_pattern(self, sentence, input_pattern,
                         deep_match=False, match_intent=False):
        try:
            record = convert2record_list(
                ['input_id', input_pattern, 'NA', 'NA'],
                self.set_dict, self.place_holder_dict)
            #sentence = sentence.lower()
            sentence = self.processor.check_and_remove_ini(
                sentence, self.analyzer, False)
            ## for now, just use jieba tokens instead of dependency-filtered lemmas
            eles2 = list(jieba.cut(sentence))
            eles2 = [e.lower().replace(" ", "") for e in eles2]
            ans = self.match_patterns(eles2, [record], self.keyword_dict,
                                      0.0, 0.0, match_intent, None, multi=False)
        except Exception as e:
            logging.warning(e)
            return None
        if len(ans) > 0:
            return ans[0]
        else:
            return None
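
## Usage sketch for RB2; all paths, the sample sentences and the pattern
## string are hypothetical (the real pattern syntax comes from the
## 'ask_pattern' sheet of the knowledge bank):
if __name__ == '__main__':
    rb2 = RB2(kb_path='../../data/kb/knowledge_bank.xlsx',
              init_stop_words_path='../libs/init_stop_words.txt',
              keywords_path='../../data/kb/keywords.xlsx')
    ## long sentences may be split and matched clause by clause
    print(rb2.match('帮我查一下余额，然后把钱转给张三', deep_match=True, topn=3))
    ## score one candidate pattern against one sentence
    print(rb2.evaluate_pattern('帮我查一下余额', '查 余额'))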