Example #1
    def __init__(self,
                 kb_path,
                 init_stop_words_path,
                 keywords_path,
                 set_path=None,
                 custom_dict=None):
        if custom_dict is not None:
            jieba.load_userdict(custom_dict)
        if set_path is None:
            self.set_dict = read_sets(kb_path, 'sets')
        else:
            self.set_dict = read_sets(set_path)
        self.place_holder_dict = read_sets(kb_path, 'place_holder')
        self.id_pattern_pairs = read_pattern(kb_path, 'ask_pattern',
                                             'intent_id', 'pattern')
        self.record_list = [
            convert2record_list(idpp, self.set_dict, self.place_holder_dict)
            for idpp in self.id_pattern_pairs
        ]

        self.keyword_dict = read_keywords(keywords_path)
        self.analyzer = han_analyzer()
        self.processor = Processor(init_stop_words_path=init_stop_words_path)
        self.base_structure = base_structure
        self.match_patterns = match_patterns
        self.get_intent_classes = get_intent_classes
        self.long_sentence_processor = Long_sentence_processor(
            init_stop_words_path)
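A minimal construction sketch for this initializer, assuming it belongs to the RB2 class shown in Example #10; every file name below is a hypothetical placeholder, not from the source:

# Hypothetical usage sketch; paths are assumptions.
rb2 = RB2(kb_path='knowledge_bank.xlsx',
          init_stop_words_path='init_stop_words.txt',
          keywords_path='keywords.xlsx',
          set_path=None,                 # falls back to the 'sets' sheet of kb_path
          custom_dict='user_dict.txt')   # optional jieba user dictionary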
Example #2
    def __init__(self, kb_path, init_stop_words_path):
        self.set_dict = read_sets(kb_path, 'sets')
        self.place_holder_dict = read_sets(kb_path, 'place_holder')
        self.id_pattern_pairs = read_pattern(kb_path, 'ask_pattern',
                                             'intent_id', 'pattern')
        self.record_list = [
            convert2record_list(idpp, self.set_dict, self.place_holder_dict)
            for idpp in self.id_pattern_pairs
        ]

        self.analyzer = han_analyzer()
        self.processor = Processor(init_stop_words_path=init_stop_words_path)
        self.base_structure = base_structure
        self.match_patterns = match_patterns
        self.get_intent_classes = get_intent_classes
Example #3
#import stanfordnlp
from hanlp_parse import han_analyzer
from sentence_structure_utils import base_structure
from knowledge_bank_utils import get_intent_classes
import numpy as np
import pandas as pd  # needed for pd.read_excel below
from input_process_util import Processor
#%%
init_stop_words_path = '../libs/init_stop_words.txt'
data_path = '../../data/results/filtered_test0424.xlsx'
out_data_path = '../../data/results/filtered_test_nlu_0424.xlsx'
#results_path = '../../data/results/.xlsx'
df = pd.read_excel(data_path, sheet_name='wenti')
qs = df['问题']

#%%
processor = Processor(init_stop_words_path=init_stop_words_path)
analyzer = han_analyzer()
#%%

## step one: remove initial stop words
df['ini_remove'] = df['问题'].apply(processor.check_and_remove_ini, args=(analyzer, False))
#%%
intents = [get_intent_classes(i) for i in qs]
#%%
df_intents = pd.DataFrame(intents)
#%%

df = df.merge(df_intents, left_index=True, right_index=True)
#%%
df.to_excel(out_data_path)
Example #4
# Imports assumed from the other examples in this file; get_dep_output_han
# is assumed to be defined elsewhere (Examples #9/#10 show a method version).
import pandas as pd
import numpy as np
from hanlp_parse import han_analyzer
from sentence_structure_utils import base_structure
from input_process_util import Processor

## set up global variables
data_path = './data/dependency_tree_test_data.xlsx'
results_path = './data/dep_tree_out_1.xlsx'
keep_columns = ['ask']
df = pd.read_excel(data_path, sheet_name='a_b_1')
df = df[keep_columns]
df.dropna(inplace=True)
df.reset_index(inplace=True)
#df = df.head(1000)
input_column_name = 'ask'
#intent_column_name = '意图'
#%%
## parse with the HanLP analyzer
print('parsing using han analyzer....')
analyzer = han_analyzer()
processor = Processor('../libs/init_stop_words.txt')
input_data = df[input_column_name].values
#%%
test_data = [processor.remove_init_stop_words(i) for i in input_data]
assert len(test_data) == len(input_data)
df['filtered_input'] = np.array(test_data)
#%%
msg_list = [
    base_structure(s, analyzer).print_dep_tree(print_out=False)
    for s in test_data
]
msg_list = ['\n'.join(m) for m in msg_list]
#%%
df['han_dep'] = df[input_column_name].apply(get_dep_output_han,
                                            args=(analyzer, ))
df['han_dep_tree'] = np.array(msg_list)
Example #5
#%%
if __name__ == '__main__':

    data_path = '../../data/raw/intent_data_clean.csv'
    results_path = '../../data/results/initial_stop_words_analysis.xlsx'
    keep_columns = ['id', '用户问句', '功能', '意图']
    df = pd.read_csv(data_path, encoding='utf8')
    df = df[keep_columns]
    df.dropna(inplace=True)
    df.reset_index(inplace=True)
    #df = df.head(1000)
    input_column_name = '用户问句'
    intent_column_name = '意图'
    processor = Processor(init_stop_words_path='../libs/init_stop_words.txt')
    analyzer = han_analyzer()

    #%%
    input_data = df[input_column_name].values
    test_data = [i for i in input_data if processor._check_candidate(i)]

    #%%
    ## test-sentence rules; rule_1 .. rule_4 are assumed to be defined or
    ## imported elsewhere (they are not part of this fragment)
    rule_map = {
        1: (rule_1, True),
        2: (rule_2, True),
        3: (rule_3, True),
        4: (rule_4, False)
    }
Example #6
from sentence_structure_utils import base_structure
from hanlp_parse import han_analyzer  # needed for han_analyzer() below
from input_process_util import Processor
import pandas as pd  # needed for pd.read_csv below
import numpy as np

#%%

def get_matched_rules(sentence, processor, analyzer):
    ## map the rule id returned by check_all_rules back to its name
    check = processor.check_all_rules(sentence, analyzer)
    return processor.rule2name[check]

#%%
if __name__ == '__main__':
    
    data_path = '../../data/raw/sentence_with_prefix'
    results_path = '../../data/results/initial_stop_words_analysis_boyan.xlsx'
    
    #%%
    df = pd.read_csv(data_path, header=None, names=['sentence'])
    df.dropna(inplace=True)
    df.reset_index(inplace=True)
    #%%
    input_column_name = 'sentence' 
    processor = Processor(init_stop_words_path='../libs/init_stop_words.txt')
    analyzer = han_analyzer()
    #%%
    df['remove_candidate'] = df[input_column_name].apply(processor._check_candidate)
    df['matched_rules'] = df[input_column_name].apply(get_matched_rules, args=(processor, analyzer))
    df['results'] = df[input_column_name].apply(processor.check_and_remove_ini, args=(analyzer, False))
    df.to_excel(results_path)
Example #7
    def __init__(self, init_stop_words_path='init_stop_words.txt'):
        self.analyzer = han_analyzer()
        self.ini_processor = Processor(init_stop_words_path)
Example #8
# Imports assumed from the other examples in this file.
import re
from hanlp_parse import han_analyzer
from sentence_structure_utils import base_structure
from input_process_util import Processor

class Long_sentence_processor(object):
    """
    A HanLP-analyzer-backed processor for dependency parsing and other
    related operations on long sentences.
    """
    def __init__(self, init_stop_words_path='init_stop_words.txt'):
        self.analyzer = han_analyzer()
        self.ini_processor = Processor(init_stop_words_path)
        
    @staticmethod
    def rule1(node):
        ## a non-root node that is a verb or sits in a coordination
        ## ("并列关系") dependency marks a split candidate
        if node['level'] > 1 and node['postag'][0].lower() == "v":
            return True
        elif node['level'] > 1 and node['node'].DEPREL == "并列关系":
            return True
        else:
            return False
        
    @staticmethod
    def rule2(node):
        ## each split component must still contain a verb
        return node['postag'][0].lower() == "v"
        

    def check_candidates(self, sentence, verbose=True):
        ## simple normalization
        nono_list = ['如果']  # reject sentences whose first clause opens with "if"
        sentence = re.sub(r'\s+', ' ', sentence).strip()
        sentence = re.sub(r',\s+', ',', sentence)
        sentence = re.sub(r',\s+', ',', sentence)
        sentence = self.ini_processor.check_and_remove_ini(sentence, self.analyzer, verbose=False)

        ## check for a comma or space
        if not any(w in sentence for w in [',', ' ', ',']):
            if verbose:
                print('no , or space found in the sentence')
            return False

        ## check sentence length
        if len(sentence) < 10:
            if verbose:
                print('sentence length < 10')
            return False

        ## check each component's length
        eles = re.split(r',| |,', sentence)
        if not all(len(e) > 4 for e in eles):
            if verbose:
                print('some components are <= 4 words')
            return False

        if any(w in eles[0] for w in nono_list):
            return False

        return True
    
    def check_dependency_rules(self, sentence, verbose=True):
        ## simple normalization
        sentence = re.sub(r'\s+', ' ', sentence).strip()
        sentence = self.ini_processor.check_and_remove_ini(sentence, self.analyzer, verbose=False)

        ## check for a complex dependency structure
        ob = base_structure(sentence, self.analyzer)
        if verbose:
            ob.print_dep_tree()

        r1_res = ob.loop_nodes(ob.dep_tree, self.rule1)
        if len(r1_res) == 0:
            return False

        eles = re.split(r',| |,', sentence)
        for e in eles:
            ob_e = base_structure(e, self.analyzer)
            if verbose:
                ob_e.print_dep_tree()
            r2_res = ob_e.loop_nodes(ob_e.dep_tree, self.rule2)
            if len(r2_res) == 0:
                return False

        return True
    
    def check_amd_split(self, sentence, verbose=False):
        ## first-level checks
        if all([self.check_candidates(sentence, verbose=verbose),
                self.check_dependency_rules(sentence, verbose=verbose)]):
            #print('check passed')
            sentence = re.sub(r'\s+', ' ', sentence).strip()
            eles = re.split(r',| |,', sentence)
            return eles

        return sentence
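A short usage sketch for the splitter above; the sentence and file name are illustrative assumptions:

# Hypothetical usage sketch: check_amd_split returns a list of clause
# strings when both checks pass, otherwise the sentence unchanged.
lsp = Long_sentence_processor('init_stop_words.txt')
result = lsp.check_amd_split('我想办一张信用卡,需要准备什么材料', verbose=False)
if isinstance(result, list):
    print('split into clauses:', result)
else:
    print('kept as one sentence:', result)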
Example #9
# Imports assumed from the other examples; read_sets, read_pattern,
# convert2record_list and match_patterns are assumed to come from the
# project's knowledge-bank utilities (module not shown in this fragment).
from hanlp_parse import han_analyzer
from sentence_structure_utils import base_structure
from knowledge_bank_utils import get_intent_classes
from input_process_util import Processor

class NLU_match(object):
    """
    A HanLP-analyzer-backed matcher for dependency parsing and other
    related operations.
    """
    def __init__(self, kb_path, init_stop_words_path):
        self.set_dict = read_sets(kb_path, 'sets')
        self.place_holder_dict = read_sets(kb_path, 'place_holder')
        self.id_pattern_pairs = read_pattern(kb_path, 'ask_pattern',
                                             'intent_id', 'pattern')
        self.record_list = [
            convert2record_list(idpp, self.set_dict, self.place_holder_dict)
            for idpp in self.id_pattern_pairs
        ]

        self.analyzer = han_analyzer()
        self.processor = Processor(init_stop_words_path=init_stop_words_path)
        self.base_structure = base_structure
        self.match_patterns = match_patterns
        self.get_intent_classes = get_intent_classes

    def get_dep_output_han(self, sentence):
        try:
            word_dict, word_objs = self.analyzer.dep_parse(sentence, False)
            res = [(w['LEMMA'], w['POSTAG'], w['DEPREL'], w['HEAD_LEMMA'])
                   for w in word_dict]
        except Exception:
            print(sentence)
            res = None
        return res

    @staticmethod
    def find_levels(node):
        ## keep nodes within the first two dependency levels
        return node['level'] < 3

    @staticmethod
    def find_levels2(node):
        ## keep nodes within the first four dependency levels
        return node['level'] < 5

    def match(self, sentence, deep_match=False, match_intent=False):
        sentence = self.processor.check_and_remove_ini(sentence, self.analyzer,
                                                       False)
        res = self.base_structure(sentence, self.analyzer)
        eles = [
            i['lemma'] for i in res.loop_nodes(res.dep_tree, self.find_levels)
        ]
        #print(eles)
        eles2 = [
            i['lemma'] for i in res.loop_nodes(res.dep_tree, self.find_levels2)
        ]
        if len(eles) < 1:
            print('log: your input sentence is empty')
            return None

        intent_classes = None
        if match_intent:
            intent_classes = self.get_intent_classes(sentence)

        ## start matching
        ans = self.match_patterns(eles, self.record_list, 0.4, 0.7,
                                  match_intent, intent_classes)
        if ans and deep_match and len(eles2) > len(eles):
            print('log: level > 2 info used for match')
            ans = self.match_patterns(eles2, ans, 0.3, 0.6)

        return ans
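A sketch of how match() might be called; the workbook name, its sheet layout, and the sentence are assumptions:

# Hypothetical usage sketch; kb_path is assumed to be an Excel workbook
# containing the 'sets', 'place_holder' and 'ask_pattern' sheets read above.
matcher = NLU_match(kb_path='knowledge_bank.xlsx',
                    init_stop_words_path='init_stop_words.txt')
ans = matcher.match('怎么办理信用卡', deep_match=True, match_intent=False)
print(ans)  # ranked pattern records, or None for an empty parse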
Example #10
# Imports assumed from the other examples; read_sets, read_pattern,
# read_keywords, convert2record_list and match_patterns are assumed to
# come from the project's knowledge-bank utilities, and
# Long_sentence_processor is the class from Example #8.
import logging
import jieba
from hanlp_parse import han_analyzer
from sentence_structure_utils import base_structure
from knowledge_bank_utils import get_intent_classes
from input_process_util import Processor

class RB2(object):
    """
    Rule base 2 functionality: a HanLP-analyzer-backed matcher for
    dependency parsing and other related operations.
    """
    def __init__(self,
                 kb_path,
                 init_stop_words_path,
                 keywords_path,
                 set_path=None,
                 custom_dict=None):
        if custom_dict is not None:
            jieba.load_userdict(custom_dict)
        if set_path is None:
            self.set_dict = read_sets(kb_path, 'sets')
        else:
            self.set_dict = read_sets(set_path)
        self.place_holder_dict = read_sets(kb_path, 'place_holder')
        self.id_pattern_pairs = read_pattern(kb_path, 'ask_pattern',
                                             'intent_id', 'pattern')
        self.record_list = [
            convert2record_list(idpp, self.set_dict, self.place_holder_dict)
            for idpp in self.id_pattern_pairs
        ]

        self.keyword_dict = read_keywords(keywords_path)
        self.analyzer = han_analyzer()
        self.processor = Processor(init_stop_words_path=init_stop_words_path)
        self.base_structure = base_structure
        self.match_patterns = match_patterns
        self.get_intent_classes = get_intent_classes
        self.long_sentence_processor = Long_sentence_processor(
            init_stop_words_path)

    def get_dep_output_han(self, sentence):
        try:
            word_dict, word_objs = self.analyzer.dep_parse(sentence, False)
            res = [(w['LEMMA'], w['POSTAG'], w['DEPREL'], w['HEAD_LEMMA'])
                   for w in word_dict]
        except Exception:
            print(sentence)
            res = None
        return res

    @staticmethod
    def find_levels(node):
        ## keep nodes within the first two dependency levels
        return node['level'] < 3

    @staticmethod
    def find_levels2(node):
        ## keep nodes within the first four dependency levels
        return node['level'] < 5

    @staticmethod
    def filter_and_rank(res_obj, find_level_func):
        #eles = [i['lemma'] for i in res_obj.loop_nodes(res_obj.dep_tree,find_level_func)]
        eles = [(i['id'], i['lemma'])
                for i in res_obj.loop_nodes(res_obj.dep_tree, find_level_func)]
        eles = sorted(eles, key=lambda x: x[0])
        eles = [x[1] for x in eles]
        return eles

    def match_one(self,
                  sentence,
                  deep_match=False,
                  match_intent=False,
                  topn=5):
        sentence = self.processor.check_and_remove_ini(sentence, self.analyzer,
                                                       False)
        res = self.base_structure(sentence, self.analyzer)
        #eles = [i['lemma'] for i in res.loop_nodes(res.dep_tree,self.find_levels)]
        eles = self.filter_and_rank(res, self.find_levels)
        #print(eles)
        #eles2 = [i['lemma'] for i in res.loop_nodes(res.dep_tree,self.find_levels2)]
        eles2 = self.filter_and_rank(res, self.find_levels2)
        #print(eles2)
        if len(eles) < 1:
            print('log: your input sentence is empty')
            return None

        intent_classes = None
        if match_intent:
            intent_classes = self.get_intent_classes(sentence)

        ## start matching
        ans = self.match_patterns(eles, self.record_list, self.keyword_dict,
                                  0.4, 0.7, match_intent, intent_classes)
        if ans and deep_match and len(eles2) > len(eles):
            print('log: level > 2 info used for match')
            ans = self.match_patterns(eles2, ans, self.keyword_dict, 0.3, 0.6)

        return ans[:topn]

    def match(self,
              sentence,
              deep_match=False,
              match_intent=False,
              topn=5,
              check_long_sentence=True):
        sentence = sentence.lower()
        if check_long_sentence:
            processed_inputs = self.long_sentence_processor.check_amd_split(
                sentence)
            logging.info(processed_inputs)
            if isinstance(processed_inputs, list):
                res = {
                    'answer1':
                    self.match_one(processed_inputs[0],
                                   deep_match=deep_match,
                                   match_intent=match_intent,
                                   topn=topn),
                    'answer2':
                    self.match_one(processed_inputs[1],
                                   deep_match=deep_match,
                                   match_intent=match_intent,
                                   topn=topn)
                }
            else:
                res = {
                    'answer1':
                    self.match_one(processed_inputs,
                                   deep_match=deep_match,
                                   match_intent=match_intent,
                                   topn=topn),
                    'answer2':
                    None
                }
        else:
            res = {
                'answer1':
                self.match_one(sentence,
                               deep_match=deep_match,
                               match_intent=match_intent,
                               topn=topn),
                'answer2':
                None
            }

        return res

    def evaluate_pattern(self,
                         sentence,
                         input_pattern,
                         deep_match=False,
                         match_intent=False):
        try:
            record = convert2record_list(
                ['input_id', input_pattern, 'NA', 'NA'], self.set_dict,
                self.place_holder_dict)
            #sentence = sentence.lower()
            sentence = self.processor.check_and_remove_ini(
                sentence, self.analyzer, False)
            #res = self.base_structure(sentence,self.analyzer)
            #eles = self.filter_and_rank(res,self.find_levels)
            #eles2 = self.filter_and_rank(res,self.find_levels2)
            ## for now, just use jieba
            eles2 = list(jieba.cut(sentence))
            eles2 = [e.lower().replace(" ", "") for e in eles2]
            ans = self.match_patterns(eles2, [record],
                                      self.keyword_dict,
                                      0.0,
                                      0.0,
                                      match_intent,
                                      None,
                                      multi=False)

        except Exception as e:
            #logging.warning('Problem with input')
            logging.warning(e)
            return None

        if len(ans) > 0:
            return ans[0]
        else:
            return None
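Finally, a sketch of the two RB2 entry points; the file names, sentence, and pattern string are assumptions:

# Hypothetical usage sketch.
rb2 = RB2('knowledge_bank.xlsx', 'init_stop_words.txt', 'keywords.xlsx')

# match() lowercases the input, optionally splits long sentences, and
# returns a dict with 'answer1'/'answer2' candidate lists (up to topn each).
res = rb2.match('开通手机银行需要什么条件', topn=3)

# evaluate_pattern() scores one sentence against a single ad-hoc pattern;
# the pattern below is a made-up example, as the real syntax is not shown.
one = rb2.evaluate_pattern('开通手机银行需要什么条件', '开通手机银行')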