예제 #1
0
    def _prepare_data(self, files):
        """Read labelled question files and build embeddings plus label matrix.

        Every line of ``files[i]`` is stripped of tabs, spaces and the trailing
        newline, cleaned with ``QueryUtils.static_remove_cn_punct`` and, when
        ``QueryUtils.static_jieba_cut`` yields at least one token, recorded
        under ``self.named_labels[i]``.  A question seen under several labels
        is embedded once and accumulates all of those labels.

        Args:
            files: sequence of file paths, index-aligned with
                ``self.named_labels``.

        Returns:
            Tuple ``(embeddings, labels, queries)``:
            ``embeddings`` is ``np.squeeze`` of the stacked
            ``self.get_w2v_emb`` results, ``labels`` is the output of the
            fitted ``MultiLabelBinarizer`` (also stored on ``self.mlb``) and
            ``queries`` lists the questions in row order.
        """
        print('prepare data...')

        embeddings = list()
        queries = list()
        queries_ = dict()
        labels = list()
        # Row of the FIRST occurrence of each question in ``queries``.  This
        # replaces the repeated linear ``queries.index`` scans.
        first_index = dict()
        mlb = MultiLabelBinarizer()

        for index, path in enumerate(files):
            label = self.named_labels[index]
            queries_[label] = list()
            with open(path, 'r') as f:
                for line in f:
                    question = line.replace('\t', '').replace(
                        ' ', '').strip('\n').decode('utf-8')
                    question = QueryUtils.static_remove_cn_punct(str(question))
                    tokens = QueryUtils.static_jieba_cut(question)
                    # Lines that tokenize to nothing carry no signal.
                    if len(tokens) == 0:
                        continue
                    queries_[label].append(question)

        for label, questions in queries_.iteritems():
            for question in questions:
                known_at = first_index.get(question)
                if known_at is not None and label not in labels[known_at]:
                    # Known question, new label: extend its label set.
                    labels[known_at].append(label)
                else:
                    # New question.  As in the previous implementation, a
                    # question repeated under a label it already carries also
                    # lands here and is added as a separate row.
                    if known_at is None:
                        first_index[question] = len(queries)
                    queries.append(question)
                    labels.append([label])
                    tokens = self.cut(question).split(' ')
                    embedding = self.get_w2v_emb(tokens)
                    embeddings.append(embedding)

        embeddings = np.array(embeddings)
        embeddings = np.squeeze(embeddings)
        self.mlb = mlb.fit(labels)
        labels = self.mlb.transform(labels)

        return embeddings, labels, queries
예제 #2
0
class QAKernel:
    """Answer a question by querying the ``qa`` Solr core.

    ``kernel`` is the entry point: it sends the purified question to Solr and
    returns one stored answer, or an entry of ``null_answer`` when Solr has
    nothing usable.
    """

    qa_url = 'http://localhost:11403/solr/qa/select?wt=json&q=question:(%s)'

    # Replies returned when no answer can be extracted from the Solr response.
    null_answer = ['null']

    def __init__(self):
        print('initilizing qa kernel...')
        self.qu = QueryUtils()

    def kernel(self, q):
        """Return an answer for question ``q``."""
        r = self._request_solr(q)
        answer = self._extract_answer(r)
        return answer

    def _extract_answer(self, r, random_range=1):
        """Pick an answer out of Solr response ``r``.

        Args:
            r: response object exposing ``json()``.
            random_range: choose randomly among the first ``random_range``
                hits (capped by the number of hits found).

        Returns:
            The selected answer (``None`` if the hit could not be read), or a
            random ``null_answer`` entry when there are no hits or the
            response cannot be parsed.
        """
        try:
            num = self._num_answer(r)
            if num > 0:
                x = random.randint(0, min(random_range - 1, num - 1))
                response = self._get_response(r, x)
                return response
            else:
                return np.random.choice(self.null_answer, 1)[0]
        except Exception:
            # Malformed or missing payload: degrade to the null answer.  The
            # attribute was previously misspelled ``null_anwer``, which made
            # this fallback raise AttributeError instead.
            return np.random.choice(self.null_answer, 1)[0]

    def _request_solr(self, q):
        """Issue the Solr query for ``q`` and return the HTTP response."""
        tokenized, exact_q = self.purify_q(q)
        url = self.qa_url % tokenized.decode('utf-8')
        print('qa_debug:', url)
        r = requests.get(url)
        return r

    def _num_answer(self, r):
        """Return ``response.numFound`` of Solr response ``r`` as an int."""
        return int(r.json()["response"]["numFound"])

    def _get_response(self, r, i=0):
        """Return the first ``answer`` value of hit ``i``, UTF-8 encoded.

        Returns ``None`` when the hit or field is missing.
        """
        try:
            a = r.json()["response"]["docs"]
            return a[i]["answer"][0].encode('utf8')
        except Exception:
            return None

    def purify_q(self, q):
        """Clean ``q`` for querying.

        Returns:
            Tuple ``(joined_tokens, cleaned_q)``: the tokens kept by
            ``QueryUtils.corenlp_cut`` (tags CD/VA/AD/VC removed) joined
            without separator, and ``q`` after ``remove_cn_punct``.
        """
        q = self.qu.remove_cn_punct(q)
        pos_q = self.qu.corenlp_cut(q, remove_tags=["CD", "VA", "AD", "VC"])
        return ''.join(pos_q), q
예제 #3
0
    def _build_feature_extraction(self, data_path):
        """Fit the text vectorizer selected by ``self.mode``.

        The corpus is column 1 of the ``#``-delimited file at ``data_path``,
        each entry passed through ``QueryUtils.static_jieba_cut``.  Modes
        ``'ngram'`` and ``'tfidf'`` store a fitted vectorizer on
        ``self.feature_extractor``; ``'fasttext'`` fits nothing here.
        """
        print('Build feature extraction...')
        with open(data_path, 'r') as handle:
            corpus = [
                QueryUtils.static_jieba_cut(row[1].decode('utf-8'))
                for row in csv.reader(handle, delimiter='#')
            ]

        if self.mode == 'ngram':
            print_cn('Use {0}'.format(self.mode))
            # Binary presence of character uni- and bi-grams.
            vectorizer = CountVectorizer(
                analyzer='char',
                ngram_range=(1, 2),
                min_df=0.0,
                max_df=1.0,
                stop_words=[',', '?', '我', '我要'],
                binary=True)
            self.feature_extractor = vectorizer.fit(corpus)
        elif self.mode == 'tfidf':
            print_cn('Use {0}'.format(self.mode))
            vectorizer = TfidfVectorizer(
                analyzer='char',
                ngram_range=(1, 2),
                min_df=1,
                max_df=1.0,
                sublinear_tf=True)
            self.feature_extractor = vectorizer.fit(corpus)
        # mode == 'fasttext': no extractor is fitted in this method.
예제 #4
0
    def where(self, q, last_r):
        current_entity, current_type, current_solr_r = self.retrieve_entity(
            q, 'store%20entertainment%20facility')
        if last_r and not current_entity:
            last_entity, last_type, last_solr_r = self.retrieve_entity(
                last_r, 'store%20entertainment%20facility')
        else:
            last_entity = None

        if current_entity:
            location = SolrUtils.get_dynamic_response(current_solr_r,
                                                      'rich_location',
                                                      random_field=True)
            if not location:
                return self.simple.kernel(q)
            return None, current_entity + "," + location
        if not last_entity:
            location = '您在问什么?'
            return None, location

        q = QueryUtils.static_remove_pu(q).decode('utf-8')
        strict = re.compile(ur'在哪|在什么地方|带我去|在哪里')
        if re.match(strict, q):
            location = SolrUtils.get_dynamic_response(last_solr_r,
                                                      'rich_location',
                                                      random_field=True)
            if not location:
                location = '数据库中不存在'
            return None, location
        return 'base', "数据库中不存在"
예제 #5
0
    def _build_feature_extractor(self, mode, files):
        """Fit a character n-gram vectorizer on the questions in ``files``.

        Each line has tabs, spaces and the trailing newline removed, is
        cleaned with ``QueryUtils.static_remove_cn_punct`` and segmented with
        ``self.cut`` before joining the corpus.

        Args:
            mode: ``'ngram'`` fits a binary ``CountVectorizer``; ``'tfidf'``
                fits a ``TfidfVectorizer``.  Any other value fits nothing.
            files: iterable of file paths to read.
        """
        print('Build feature extraction...')
        corpus = []

        for path in files:
            with open(path, 'r') as handle:
                for raw in handle:
                    text = raw.replace('\t', '').replace(' ', '')
                    text = text.strip('\n').decode('utf-8')
                    text = QueryUtils.static_remove_cn_punct(str(text))
                    corpus.append(self.cut(text))

        if mode == 'ngram':
            vectorizer = CountVectorizer(
                analyzer='char',
                ngram_range=(1, 2),
                min_df=0.0,
                max_df=1.0,
                stop_words=[',', '?', '我', '我要'],
                binary=True)
            self.feature_extractor = vectorizer.fit(corpus)
        elif mode == 'tfidf':
            print_cn('use {0}'.format(mode))
            vectorizer = TfidfVectorizer(
                analyzer='char',
                ngram_range=(1, 2),
                min_df=1,
                max_df=1.0,
                sublinear_tf=True)
            self.feature_extractor = vectorizer.fit(corpus)
예제 #6
0
파일: gkernel.py 프로젝트: aquadrop/solr_py
    def __init__(self, graph_path, clf_path):
        """Set up the kernel from a graph file and a classifier file.

        Args:
            graph_path: passed to ``self._load_graph``.
            clf_path: passed to ``self._load_clf``.
        """
        # Placeholders first; the loaders below run after they are set.
        self.graph = self.gbdt = None
        self.state_cleared = True

        self._load_graph(graph_path)
        self._load_clf(clf_path)

        self.qu = QueryUtils()
예제 #7
0
 def _request_solr(self, q):
     """Query Solr with ``q`` split into OR-ed ``question:`` terms.

     Returns:
         The ``requests`` response for the URL built from ``self.qa_url``.
     """
     terms = QueryUtils.static_jieba_cut(q, smart=False, remove_single=True)
     clause = ' OR '.join('question:' + term for term in terms)
     url = self.qa_url.format(clause)
     cn_util.print_cn(url)
     return requests.get(url)
예제 #8
0
 def __init__(self):
     """Set up Solr URL templates, query utilities and the similarity model.

     Loading ``BenebotSim`` is best-effort: on failure the traceback is
     printed and ``self.bt`` stays ``None``.
     """
     print('attaching qa kernel...')
     # Example query:
     # http://localhost:11403/solr/sc_qa/select?fq=entity:%E5%8E%95%E6%89%80&indent=on&q=*:*&wt=json
     self.qa_url = 'http://localhost:11403/solr/sc_qa/select?q.op=OR&wt=json&q={0}'
     self.qa_exact_match_url = 'http://localhost:11403/solr/sc_qa/select?wt=json&q=question:{0}'
     self.qu = QueryUtils()
     # Always define the attribute so that later ``if not self.bt`` checks
     # work instead of raising AttributeError when loading fails.
     self.bt = None
     try:
         self.bt = BenebotSim.Instance()
     except Exception:
         traceback.print_exc()
예제 #9
0
 def __init__(self, data_path):
     """Initialise empty classifier state for the data at ``data_path``.

     Nothing is read here; the path is only stored on the instance.
     """
     print('initilizing classifier...')
     self.data_path = data_path

     # All bookkeeping containers start empty.
     self.vol = dict()
     self.num_vol = 0
     self.classes = dict()
     self.index_classes = dict()
     self.classes_num_sub = dict()
     self.classifiers = dict()

     self.qu = QueryUtils()
예제 #10
0
    def _build(self, data_path):
        """Build the training features and multi-hot labels from a data file.

        The file is ``#``-delimited: column 0 holds comma-separated intention
        labels, column 1 the input text.  In ``'fasttext'`` mode each row
        becomes ``self._fasttext_vector(tokens)`` and rows with a falsy vector
        are skipped; in every other mode the tokens are collected and run
        through ``self.feature_extractor.transform(...).toarray()``.

        Returns:
            Tuple ``(embeddings, labels_)`` where ``labels_`` is the output of
            the fitted ``MultiLabelBinarizer`` (kept on ``self.mlb``).
        """
        self._build_feature_extraction(data_path)
        binarizer = MultiLabelBinarizer()
        use_fasttext = self.mode == 'fasttext'

        samples = []
        targets = []
        with open(data_path, 'r') as handle:
            for row in csv.reader(handle, delimiter='#'):
                if use_fasttext:
                    key = row[0].decode('utf-8')
                    text = row[1].decode('utf-8')
                    intentions = key.split(",")
                    vector = self._fasttext_vector(
                        QueryUtils.static_jieba_cut(text))
                    if not vector:
                        continue
                    samples.append(vector)
                else:
                    key = row[0].encode('utf-8')
                    text = row[1].encode('utf-8')
                    intentions = key.split(",")
                    samples.append(QueryUtils.static_jieba_cut(text))
                targets.append(intentions)

        if not use_fasttext:
            samples = self.feature_extractor.transform(samples).toarray()

        self.mlb = binarizer.fit(targets)
        return samples, self.mlb.transform(targets)
예제 #11
0
 def select_max_match_with_sim(self, q, r):
     """Return the answer of the Solr hit whose question is most similar to ``q``.

     Args:
         q: user query.
         r: Solr response to pick from.

     Returns:
         The ``answer`` field of the best-matching hit when the similarity
         reported by ``self.bt.getMaxSim`` exceeds 0.3; ``None`` otherwise or
         when ``self.bt`` is not available.
     """
     if not self.bt:
         return None

     def spaced(text):
         # Space-joined jieba tokens, the form handed to getMaxSim.
         return ' '.join(QueryUtils.static_jieba_cut(text))

     candidates = SolrUtils.get_dynamic_response(
         r=r, key='question', random_hit=False, random_field=True,
         keep_array=False, facet=True)
     q_tokens = spaced(q)
     candidate_tokens = [spaced(candidate) for candidate in candidates]

     best = self.bt.getMaxSim(q_tokens, candidate_tokens)
     # Undo the tokenisation spaces to recover the original sentence.
     best_sentence = best['sentence'].replace(' ', '')
     sim = best['sim']
     cn_util.print_cn(best_sentence, str(sim),
                      '[' + ','.join(candidates) + ']')

     if sim > 0.3:
         hit = candidates.index(best_sentence)
         return SolrUtils.get_dynamic_response(
             r, key='answer', force_hit=hit, random_field=True,
             random_hit=False)
     return None
예제 #12
0
 def kernel(self, q, last_r):
     """Answer ``q``: exact match first, then the classifier-selected handler.

     Args:
         q: user query.
         last_r: previous turn, forwarded to the selected handler.

     Returns:
         ``(None, exact_answer)`` on an exact match, the
         ``(direction, answer)`` pair of the handler chosen by
         ``self.clf.predict``, or ``self.simple.kernel(q)`` when no handler
         applies or any exception occurs (the traceback is printed).
     """
     # (predicted class, handler method name), tested in this order.
     # Handlers for permit/whether/when/how/which/what are not wired in.
     routes = (
         ('where', 'where'),
         ('exist', 'exist'),
         ('ask_price', 'ask_price'),
         ('ask_discount', 'ask_discount'),
         ('ask_queue', 'ask_queue'),
         ('list', 'list'),
         ('ask_taste', 'taste'),
     )
     try:
         exact = self.exact_match(QueryUtils.static_remove_pu(q))
         if exact:
             return None, exact

         cls, probs = self.clf.predict(q)
         for intent, method_name in routes:
             if cls == intent:
                 handler = getattr(self, method_name)
                 direction, answer = handler(q=q, last_r=last_r)
                 return direction, answer
         return self.simple.kernel(q)
     except Exception:
         traceback.print_exc()
         return self.simple.kernel(q)
예제 #13
0
 def _request_solr(self, q, key, base_url):
     """Query Solr for the tokens of ``q`` in field ``key``.

     Args:
         q: raw query text.
         key: Solr field name.
         base_url: ``%``-style URL template taking the field query.

     Returns:
         The ``requests`` response, or ``None`` when ``q`` yields no tokens.
     """
     field = '%s:' % key
     terms = list(
         QueryUtils.static_jieba_cut(q, smart=False, remove_single=True))
     if not terms:
         return None
     url = base_url % (field + "(" + '%20'.join(terms) + ")")
     cn_util.print_cn(url)
     return requests.get(url)
예제 #14
0
 def _request_solr(self, q, key, base_url):
     """Query Solr with the tokens of ``q`` as an ``fq`` filter on ``key``.

     Args:
         q: raw query text.
         key: Solr field name used in the filter.
         base_url: ``str.format`` URL template taking the filter clause.

     Returns:
         The ``requests`` response, or ``None`` if anything fails (the
         traceback is printed).
     """
     try:
         prefix = 'fq=%s:' % key
         terms = list(
             QueryUtils.static_jieba_cut(q, smart=False, remove_single=True))
         url = base_url.format(prefix + '%20'.join(terms))
         cn_util.print_cn(url)
         return requests.get(url)
     except:
         traceback.print_exc()
         return None
예제 #15
0
    def regex_plugin(self, q):
        """Classify ``q`` into a scene using regex rules and entity lookup.

        The rules are tried in a fixed order and the first hit wins.  All
        patterns are applied with ``re.match``, i.e. anchored at the start
        of ``q``.

        Returns:
            A 3-tuple ``(scene, second, q)``.  ``scene`` is one of
            ``'reqmore'``, ``'base'``, ``'qa'``, ``'sing'``, ``'sale'``,
            ``'greeting'`` or ``None`` when nothing matched (or a rule
            raised).  ``second`` is ``'base'`` for greetings and ``None``
            otherwise -- presumably a suggested fallback scene; confirm
            against the caller.  ``q`` is returned unchanged except on the
            greeting path, where it may be cleaned and re-encoded.
        """
        # q = QueryUtils.static_corenlp_cut(q, remove_tags=QueryUtils.remove_tags)

        if re.match(self.request_more_pattern, q):
            return 'reqmore', None, q
        if re.match(self.base_pattern, q):
            return 'base', None, q
        type_ = self.entity_recog(q)
        if type_ == 'store':
            # A truthy second value from self.neg.predict sends the query
            # to 'base' instead of 'qa'.
            _, neg = self.neg.predict(q)
            if neg:
                return 'base', None, q
            else:
                return 'qa', None, q

        # if type_ == 'item':
        #     return 'sale', 'qa', q

        try:
            if re.match(self.qa_pattern, q):
                # q = re.sub(self.qa_clean_pattern, '', q)
                return 'qa', None, q
            if re.match(self.sing_pattern, q):
                # sing_diff_pattern overrides: such queries go to 'qa'.
                if re.match(self.sing_diff_pattern, q):
                    return 'qa', None, q
                return 'sing', None, q
            if re.match(self.sale_pattern, q):
                return 'sale', None, q
            if re.match(self.greeting_pattern, q):
                # Apply the clean pattern only to queries longer than one
                # character.
                if (len(q)) > 1:
                    q = re.sub(self.greeting_clean_pattern, '', q)
                try:
                    q = QueryUtils.static_corenlp_cut(
                        q, remove_tags=QueryUtils.remove_tags)
                    q = ''.join(q).decode('utf-8')
                    # Nothing left after cleaning: use a default greeting.
                    if not q:
                        q = u'你好'
                except:
                    # Best effort: keep whatever q is at this point.
                    pass
                if isinstance(q, str):
                    q = q.decode('unicode-escape').encode('utf-8')
                return 'greeting', 'base', q
            return None, None, q
        except:
            # Any failure in the rules above is reported as "no match".
            return None, None, q
예제 #16
0
    def predict(self, input_):
        """Predict the labels of a single input text.

        Args:
            input_: raw text; tokenised with ``QueryUtils.static_jieba_cut``.

        Returns:
            Tuple ``(labels, probs)``: the label tuple for the sample as given
            by ``self.mlb.inverse_transform`` and the ``predict_proba``
            entries at the positions where the prediction equals 1.

        Raises:
            Any exception raised while building the features now propagates
            to the caller.  The previous ``except: exit(-1)`` terminated the
            whole interpreter without a diagnostic and could not be handled
            by an ``except Exception`` in the caller.
        """
        tokens = QueryUtils.static_jieba_cut(input_)
        if self.mode == 'fasttext':
            embedding = self._fasttext_vector(tokens)
            embeddings = np.array([embedding])
        else:
            # Only row 0 of the transformed matrix is used, as a 1 x n array.
            embeddings = np.reshape(
                self.feature_extractor.transform(tokens).toarray()[0],
                [1, -1])
        prediction = self.clf.predict(embeddings)
        prediction_index_first_sample = np.where(prediction[0] == 1)
        labels = self.mlb.inverse_transform(prediction)
        probs = self.clf.predict_proba(embeddings)

        # note that in prediction stage, n_samples == 1
        return labels[0], probs[0][prediction_index_first_sample]
예제 #17
0
 def kernel(self, q):
     """Decide which scene should handle query ``q``.

     Regex rules (``self.regex_plugin``) are tried first; otherwise the
     local classifier is used, or the remote scene service when
     ``self.web`` is truthy.

     Returns:
         Always a 3-tuple ``(scene, suggested_scene, q)``.  ``scene`` is
         ``None`` when classification failed.  ``q`` may have had
         punctuation removed on the classifier path.
     """
     # first try regex_plugin:
     scene, sugg_scene, q = self.regex_plugin(q)
     if scene:
         return scene, sugg_scene, q
     try:
         if not self.web:
             if not self.clf:
                 return 'sale', None, q
             q = QueryUtils.static_remove_pu(q)
             labels, _ = self.clf.predict(question=q)
             select = self.select_label(labels)
             # qa plugin:
             if select == 'qa':
                 return select, 'sale', q
             if select == 'greeting':
                 return select, 'base', q
             return select, None, q
         else:
             text = requests.get('http://localhost:11305/sc/scene?q=' + q)
             return text.text, None, q
     except Exception:
         # Keep the same arity as every other return path; this used to be
         # a 2-tuple, which breaks callers unpacking three values.
         return None, None, q
예제 #18
0
    def _prepare_data(self, files):
        """Read ``question#label`` lines and turn them into training arrays.

        Each line has tabs, spaces and the trailing newline removed and is
        split on ``#``.  Field 0 is cleaned with
        ``QueryUtils.static_simple_remove_punct``; field 1 is looked up in
        ``self.named_labels`` to obtain the integer label.

        Args:
            files: sequence of file paths to read.

        Returns:
            Tuple ``(embeddings, labels, queries)``: ``np.squeeze`` of the
            stacked ``self.feature_extractor`` outputs, the list of label
            indices and the list of cleaned questions, all in line order.
        """
        print('prepare data...')

        embeddings, queries, labels = [], [], []

        for path in files:
            with open(path, 'r') as source:
                for raw in source:
                    fields = raw.replace('\t', '').replace(' ', '')
                    fields = fields.strip('\n').decode('utf-8').split('#')
                    question = QueryUtils.static_simple_remove_punct(
                        str(fields[0]))
                    label = self.named_labels.index(
                        str(fields[1].encode('utf-8')))
                    queries.append(question)
                    labels.append(label)
                    # transform() expects a batch, hence the 1-element list.
                    vector = self.feature_extractor.transform(
                        [self.cut(question)]).toarray()
                    embeddings.append(vector)

        embeddings = np.squeeze(np.array(embeddings))
        return embeddings, labels, queries
예제 #19
0
 def __init__(self):
     """Create the kernel with no remembered previous query."""
     print('initilizing interactive kernel...')
     self.qu = QueryUtils()
     # No previous query has been seen yet.
     self.last_g = None
예제 #20
0
class IKernel:
    """Conversational kernel backed by the ``interactive`` Solr core.

    The previous query is remembered in ``last_g`` and, when present, is
    added to the next Solr query as lower-weighted context.
    """

    i_url = 'http://localhost:11403/solr/interactive/select?wt=json&q=g:(%s) OR exact_g:(%s)^4'
    simple_context_i_url = 'http://localhost:11403/solr/interactive/select?wt=json&q=g:(%s)^10 OR exact_g:(%s)^20 OR last_g:(%s)^2 OR exact_last_g:(%s)^8'

    # Replies returned when no answer can be extracted from the Solr response.
    null_answer = ['null']

    def __init__(self):
        print('initilizing interactive kernel...')
        self.last_g = None
        self.qu = QueryUtils()

    def kernel(self, q):
        """Return a reply for query ``q``."""
        r = self._request_solr(q)
        answer = self._extract_answer(r)
        return answer

    def _extract_answer(self, r, random_range=1):
        """Pick a reply out of Solr response ``r``.

        Args:
            r: response object exposing ``json()``.
            random_range: choose randomly among the first ``random_range``
                hits (capped by the number of hits found).

        Returns:
            The selected reply (``None`` if the hit could not be read), or a
            random ``null_answer`` entry when there are no hits or the
            response cannot be parsed.
        """
        try:
            num = self._num_answer(r)
            if num > 0:
                x = random.randint(0, min(random_range - 1, num - 1))
                response = self._get_response(r, x)
                return response
            else:
                # Uniform choice.  The former hard-coded p=[0.5, 0.5] only
                # fits a two-element list, and the attribute name was
                # misspelled ``null_anwer``.
                return np.random.choice(self.null_answer, 1)[0]
        except Exception:
            # Malformed or missing payload: degrade to the null answer.
            return np.random.choice(self.null_answer, 1)[0]

    def _request_solr(self, q):
        """Query Solr for ``q`` and remember ``q`` as the previous query."""
        tokenized, exact_q = self.purify_q(q)
        if not self.last_g:
            url = self.i_url % (tokenized, exact_q)
            self.last_g = q
        else:
            last_tkz, last_exact_q = self.purify_q(self.last_g)
            url = self.simple_context_i_url % (tokenized, exact_q, last_tkz,
                                               last_exact_q)
            self.last_g = q
        cn_util.print_cn('debug:interactive_url:' + url)
        r = requests.get(url)
        return r

    def clear_state(self):
        """Forget the previous query."""
        self.last_g = None

    def _num_answer(self, r):
        """Return ``response.numFound`` of Solr response ``r`` as an int."""
        return int(r.json()["response"]["numFound"])

    def _get_response(self, r, i=0):
        """Return a random entry of field ``b`` of hit ``i``, UTF-8 encoded.

        Returns ``None`` when the hit or field is missing or empty.
        """
        try:
            a = r.json()["response"]["docs"][i]['b']
            x = random.randint(0, len(a) - 1)
            return a[x].encode('utf8')
        except Exception:
            return None

    def purify_q(self, q):
        """Clean ``q`` for querying.

        Returns:
            Tuple ``(joined_tokens, cleaned_q)``: the tokens kept by
            ``QueryUtils.corenlp_cut`` (tags CD/PN/VA/AD/VC/SP removed)
            joined without separator, and ``q`` after ``remove_cn_punct``.
        """
        q = self.qu.remove_cn_punct(q)
        pos_q = self.qu.corenlp_cut(
            q, remove_tags=["CD", "PN", "VA", "AD", "VC", "SP"])
        return ''.join(pos_q), q
예제 #21
0
 def __init__(self):
     """Set up the query utilities and the music knowledge-base Solr URL."""
     print('attaching sing kernel...')
     self.qu = QueryUtils()
     # Example of a query against the sc_qa core, kept for reference:
     # http://localhost:11403/solr/sc_qa/select?fq=entity:%E5%8E%95%E6%89%80&indent=on&q=*:*&wt=json
     self.qa_url = 'http://localhost:11403/solr/sc_music_kb/select?q.op=OR&wt=json&q=%s'
예제 #22
0
 def __init__(self):
     """Set up the greeting Solr URL templates and query utilities."""
     print('attaching greeting kernel...')
     # URL templates: tokenised OR-query and exact-question lookup.
     self.greeting_url = 'http://localhost:11403/solr/sc_greeting/select?q.op=OR&wt=json&q=question:(%s)'
     self.exact_greeting_url = 'http://localhost:11403/solr/sc_greeting/select?wt=json&q=exact_question:(%s)'
     # No previous greeting has been seen yet.
     self.last_g = None
     self.qu = QueryUtils()
예제 #23
0
 def __init__(self):
     """Create the QA kernel and its ``QueryUtils`` helper instance."""
     print('initilizing qa kernel...')
     # Shared helper for query clean-up, stored for use by other methods.
     self.qu = QueryUtils()
예제 #24
0
 def cut(self, input_):
     """Segment ``input_`` with jieba (``cut_all=True``) into one string.

     Punctuation is first removed with
     ``QueryUtils.static_remove_cn_punct``.  The tokens are joined with
     single spaces and passed through ``_uniout.unescape(..., 'utf8')``.
     """
     cleaned = QueryUtils.static_remove_cn_punct(input_)
     seg = " ".join(jieba.cut(cleaned, cut_all=True))
     return _uniout.unescape(str(seg), 'utf8')
예제 #25
0
def cut(input_):
    """Return the jieba tokens (``cut_all=False``) of ``input_`` as a list.

    Punctuation is removed with ``QueryUtils.static_remove_cn_punct`` before
    segmentation.
    """
    cleaned = QueryUtils.static_remove_cn_punct(input_)
    return list(jieba.cut(cleaned, cut_all=False))
예제 #26
0
 def cut(self, input_):
     """Segment ``input_`` with jieba (``cut_all=False``) into one string.

     Punctuation is first removed with
     ``QueryUtils.static_simple_remove_punct``.  The tokens are joined with
     single spaces and passed through ``_uniout.unescape(..., 'utf8')``.
     """
     stripped = QueryUtils.static_simple_remove_punct(input_)
     pieces = jieba.cut(stripped, cut_all=False)
     return _uniout.unescape(str(" ".join(pieces)), 'utf8')