Example #1
    def _build_feature_extraction(self, data_path):
        print('Build feature extraction...')
        corpus = list()
        with open(data_path, 'r') as f:
            reader = csv.reader(f, delimiter='#')
            for line in reader:
                b = line[1].decode('utf-8')
                tokens = QueryUtils.static_jieba_cut(b)
                # The vectorizers expect an iterable of document strings,
                # so re-join the tokens with spaces.
                corpus.append(' '.join(tokens))

        if self.mode == 'ngram':
            print_cn('Use {0}'.format(self.mode))
            bigram_vectorizer = CountVectorizer(
                ngram_range=(1, 2),
                min_df=0.0,
                max_df=1.0,
                analyzer='char',
                stop_words=[',', '?', '我', '我要'],
                binary=True)
            self.feature_extractor = bigram_vectorizer.fit(corpus)
        elif self.mode == 'tfidf':
            print_cn('Use {0}'.format(self.mode))
            tfidf_vectorizer = TfidfVectorizer(analyzer='char',
                                               ngram_range=(1, 2),
                                               max_df=1.0,
                                               min_df=1,
                                               sublinear_tf=True)
            self.feature_extractor = tfidf_vectorizer.fit(corpus)
        elif self.mode == 'fasttext':
            # fastText embeddings are computed per query elsewhere,
            # so there is no extractor to fit here.
            pass
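
These snippets all depend on QueryUtils.static_jieba_cut, which is not shown. A minimal sketch of what it might look like, assuming the standard jieba API; the mapping of smart onto jieba's cut_all flag and the remove_single behaviour are inferred from the call sites, not confirmed by the source:

    # -*- coding: utf-8 -*-
    import jieba

    class QueryUtils(object):
        @staticmethod
        def static_jieba_cut(text, smart=True, remove_single=False):
            # jieba.cut yields unicode tokens; cut_all=True is jieba's
            # "full" mode, which the smart=False call sites presumably
            # use to boost recall.
            tokens = list(jieba.cut(text, cut_all=not smart))
            if remove_single:
                # Drop single-character tokens, as the Solr examples request.
                tokens = [t for t in tokens if len(t) > 1]
            return tokens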
Example #2
    def _request_solr(self, q):
        # Cut q into tokens and OR them together as a Solr field query.
        tokens = ['question:' + s for s in QueryUtils.static_jieba_cut(q, smart=False, remove_single=True)]
        q = ' OR '.join(tokens)
        url = self.qa_url.format(q)
        cn_util.print_cn(url)
        r = requests.get(url)
        return r
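
For concreteness, a hypothetical qa_url and the URL this method produces (host and core name are invented; requests percent-encodes the spaces around OR when the request is actually sent):

    qa_url = 'http://localhost:8983/solr/qa/select?q={0}&wt=json'  # hypothetical
    tokens = ['question:' + s for s in [u'退货', u'流程']]
    print(qa_url.format(' OR '.join(tokens)))
    # http://localhost:8983/solr/qa/select?q=question:退货 OR question:流程&wt=json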
Example #3
    def _build(self, data_path):
        self._build_feature_extraction(data_path)
        mlb = MultiLabelBinarizer()
        embeddings = list()
        labels = list()
        if self.mode == 'fasttext':
            with open(data_path, 'r') as f:
                reader = csv.reader(f, delimiter='#')
                for line in reader:
                    key = line[0].decode('utf-8')
                    input_ = line[1].decode('utf-8')
                    intention_list = key.split(",")
                    tokens = QueryUtils.static_jieba_cut(input_)
                    vector = self._fasttext_vector(tokens)
                    # Skip rows for which no embedding could be produced.
                    if not vector:
                        continue
                    embeddings.append(vector)
                    labels.append(intention_list)

            self.mlb = mlb.fit(labels)
            labels_ = self.mlb.transform(labels)
            return embeddings, labels_
        else:
            with open(data_path, 'r') as f:
                reader = csv.reader(f, delimiter='#')
                for line in reader:
                    key = line[0].decode('utf-8')
                    input_ = line[1].decode('utf-8')
                    intention_list = key.split(",")
                    tokens = QueryUtils.static_jieba_cut(input_)
                    # Store the space-joined tokens; the fitted vectorizer
                    # transforms the whole corpus in one pass below.
                    embeddings.append(' '.join(tokens))
                    labels.append(intention_list)

            embeddings = self.feature_extractor.transform(embeddings).toarray()
            self.mlb = mlb.fit(labels)
            labels_ = self.mlb.transform(labels)
            return embeddings, labels_
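
_fasttext_vector is likewise not shown. One plausible minimal sketch, assuming a gensim fastText model stored on the instance (the ft_model attribute name is invented), returning None for rows the caller should skip:

    import numpy as np

    def _fasttext_vector(self, tokens):
        # Hypothetical: average the fastText vectors of in-vocabulary
        # tokens and return a plain list, or None when nothing is
        # embeddable, matching the `if not vector: continue` guard above.
        vectors = [self.ft_model.wv[t] for t in tokens if t in self.ft_model.wv]
        if not vectors:
            return None
        return np.mean(vectors, axis=0).tolist()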
Example #4
    def select_max_match_with_sim(self, q, r):
        if not self.bt:
            return None
        matched_questions = SolrUtils.get_dynamic_response(r=r, key='question',
                                                           random_hit=False,
                                                           random_field=True,
                                                           keep_array=False,
                                                           facet=True)
        q_tokens = ' '.join(QueryUtils.static_jieba_cut(q))
        matched_questions_tokens = [' '.join(QueryUtils.static_jieba_cut(mqt)) for mqt in matched_questions]
        max_sim = self.bt.getMaxSim(q_tokens, matched_questions_tokens)
        # getMaxSim works on space-joined tokens, so strip the spaces
        # to recover the original candidate sentence.
        best_sentence = ''.join(max_sim['sentence'].split(' '))
        sim = max_sim['sim']
        cn_util.print_cn(best_sentence, str(sim), '[' + ','.join(matched_questions) + ']')
        if sim > 0.3:
            index = matched_questions.index(best_sentence)
            answer = SolrUtils.get_dynamic_response(r, key='answer', force_hit=index,
                                                    random_field=True,
                                                    random_hit=False)
            return answer
        return None
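
The bt component and its getMaxSim contract are external to this snippet. A toy stand-in that honours the same {'sentence': ..., 'sim': ...} return shape, here using difflib instead of whatever similarity model the real bt wraps:

    import difflib

    class SequenceMatcherSim(object):
        # Hypothetical drop-in for self.bt in the snippet above.
        def getMaxSim(self, query, candidates):
            ratio = lambda c: difflib.SequenceMatcher(None, query, c).ratio()
            best = max(candidates, key=ratio)
            return {'sentence': best, 'sim': ratio(best)}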
Example #5
    def _prepare_data(self, files):
        print('prepare data...')

        embeddings = list()
        queries = list()
        queries_ = dict()
        labels = list()
        mlb = MultiLabelBinarizer()

        # One input file per label; collect the cleaned questions per label.
        for index in xrange(len(files)):
            path = files[index]
            label = self.named_labels[index]
            queries_[label] = list()
            with open(path, 'r') as f:
                for line in f:
                    question = line.replace('\t', '').replace(
                        ' ', '').strip('\n').decode('utf-8')
                    # Pass the unicode through directly; wrapping it in
                    # str() would raise UnicodeEncodeError on CJK in Python 2.
                    question = QueryUtils.static_remove_cn_punct(question)
                    tokens = QueryUtils.static_jieba_cut(question)
                    if len(tokens) == 0:
                        continue
                    queries_[label].append(question)

        # Merge questions that appear under several files into single
        # multi-label entries.
        for label, questions in queries_.iteritems():
            for question in questions:
                if question in queries:
                    index = queries.index(question)
                    if label not in labels[index]:
                        labels[index].append(label)
                else:
                    queries.append(question)
                    labels.append([label])
                    tokens = self.cut(question).split(' ')
                    embedding = self.get_w2v_emb(tokens)
                    embeddings.append(embedding)

        embeddings = np.array(embeddings)
        embeddings = np.squeeze(embeddings)
        self.mlb = mlb.fit(labels)
        labels = self.mlb.transform(labels)

        return embeddings, labels, queries
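
For reference, this is what the MultiLabelBinarizer fit/transform pair produces (the label names here are invented):

    from sklearn.preprocessing import MultiLabelBinarizer

    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform([['greeting'], ['greeting', 'qa'], ['qa']])
    # mlb.classes_ -> ['greeting' 'qa']
    # y            -> [[1 0]
    #                  [1 1]
    #                  [0 1]]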
Example #6
    def _request_solr(self, q, key, base_url):
        # Cut q into tokens and group them into a single Solr field query.
        key = '%s:' % key
        tokens = QueryUtils.static_jieba_cut(q, smart=False, remove_single=True)
        if len(tokens) == 0:
            return None
        # Join with a pre-encoded space (%20) so the final URL needs
        # no further quoting.
        q = key + "(" + '%20'.join(tokens) + ")"
        url = base_url % q
        cn_util.print_cn(url)
        r = requests.get(url)
        return r
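
Joining on the literal '%20' pre-encodes the spaces inside the grouped query. With a hypothetical base_url (host and core invented) the result looks like this:

    base_url = 'http://localhost:8983/solr/qa/select?q=%s&wt=json'  # hypothetical
    tokens = [u'退货', u'流程']
    print(base_url % ('question:' + '(' + '%20'.join(tokens) + ')'))
    # http://localhost:8983/solr/qa/select?q=question:(退货%20流程)&wt=json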
Example #7
    def _request_solr(self, q, key, base_url):
        try:
            # Cut q into tokens and build a filter-query (fq) parameter.
            key = 'fq=%s:' % key
            tokens = QueryUtils.static_jieba_cut(q, smart=False, remove_single=True)
            q = key + '%20'.join(tokens)
            url = base_url.format(q)
            cn_util.print_cn(url)
            r = requests.get(url)
            return r
        except Exception:
            traceback.print_exc()
            return None
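
Compared with Example #6, this variant writes the tokens into Solr's fq (filter query) parameter rather than the main query, drops the empty-token guard, and converts any failure into a None return so the caller can degrade gracefully instead of handling the exception itself.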
Example #8
    def predict(self, input_):
        tokens = QueryUtils.static_jieba_cut(input_)
        try:
            if self.mode == 'fasttext':
                embedding = self._fasttext_vector(tokens)
                embeddings = np.array([embedding])
            else:
                # The vectorizer expects a list of document strings, so
                # wrap the space-joined tokens in a single-element list.
                embeddings = np.reshape(
                    self.feature_extractor.transform([' '.join(tokens)]).toarray()[0],
                    [1, -1])
        except Exception:
            traceback.print_exc()
            sys.exit(-1)
        prediction = self.clf.predict(embeddings)
        prediction_index_first_sample = np.where(prediction[0] == 1)
        labels = self.mlb.inverse_transform(prediction)
        probs = self.clf.predict_proba(embeddings)

        # n_samples == 1 at prediction time, so return the first sample's
        # label tuple and the probabilities at its positive positions.
        return labels[0], probs[0][prediction_index_first_sample]
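
A hypothetical call site, to make the return shape concrete (the model variable and intent name are invented):

    labels, probs = model.predict(u'我要退货')
    # labels -> ('refund',)      the first sample's intent tuple from mlb
    # probs  -> array([ 0.87])   classifier scores at the positive positions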