示例#1
0
 def get_index_entities_regex(self, entities):
     """Convert a list of entity definitions into regular expressions.

     For every entity the literal values are collected from ``data``
     (strings directly in the list, or strings one level deep inside a
     nested list), passed through ``clean_re`` and joined into one
     alternation. If the entity also carries a ``regex`` entry it is
     OR-ed onto that alternation. The result is wrapped so that optional
     whitespace is accepted on both sides.

     When an entity has more than ``LIMIT`` values they are shuffled with
     a fixed random state and only the first ``LIMIT`` are kept.

     Returns:
         dict mapping each entity name to its regex string.

     Raises:
         AssertionError: if an item has no string ``entity`` attribute.
     """
     regex_by_entity = {}
     for entity in entities:
         assert 'entity' in entity and isinstance(entity['entity'], str), \
             '实体必须有entity属性且为字符串类型'

         # Flatten ``data``: keep plain strings and strings inside sub-lists.
         words = []
         for item in entity['data']:
             if isinstance(item, str):
                 words.append(item)
             elif isinstance(item, list):
                 words.extend(sub for sub in item if isinstance(sub, str))

         if len(words) > LIMIT:
             words = shuffle(words, random_state=0)[:LIMIT]

         alternatives = '(?:' + '|'.join(clean_re(word) for word in words) + ')'
         if 'regex' in entity:
             alternatives += '|(?:' + entity['regex'] + ')'

         pattern = '\\s*(?:' + alternatives + ')\\s*'
         regex_by_entity[entity['entity']] = pattern
         LOG.debug('entity: %s regex: %s', entity['entity'], pattern)
     return regex_by_entity
示例#2
0
def unit_test():
    """Ad-hoc smoke test for NeuralSlotFiller on the 'nlu_data' directory.

    Loads the NLU data, converts it to IOB form and runs a 5-fold
    cross-validation. The process is terminated right after that, so the
    fit / eval / predict statements below ``exit(0)`` are never executed.
    """
    from nlu.utils.data_loader import load_nlu_data
    from nlu.utils.data_iob import data_to_iob

    intent_defs, entity_defs = load_nlu_data('nlu_data')
    sentences, slots, _ = data_to_iob(intent_defs, entity_defs)

    NeuralSlotFiller.cv_eval(sentences, slots, cv=5)
    exit(0)

    # NOTE: unreachable because of the exit(0) above; kept as the manual
    # single-model check (train on everything, then evaluate and predict).
    filler = NeuralSlotFiller()
    filler.fit(sentences, slots)

    LOG.debug('crf fitted')

    scores = filler.eval(sentences, slots, progress=True)
    for name, value in scores.items():
        print(name, value)

    print(filler.predict([list('我要买第18138期')]))
示例#3
0
 def _get_iob(iob):
     """Return the IOB triple, building it only when it is not cached yet.

     ``iob`` is a 3-item sequence; a ``None`` first item means "not built".
     In that case the data is produced by ``data_to_iob(intents, entities)``
     (both names come from the enclosing scope) and returned as a tuple of
     (sentences, slots, domains). Otherwise ``iob`` is returned unchanged.
     """
     if iob[0] is not None:
         return iob
     LOG.info('build IOB data')
     sentences, slots, domains = data_to_iob(intents, entities)
     return sentences, slots, domains
示例#4
0
    def __init__(self, intent, index_entities_regex):
        """Compile one intent definition into an anchored regular expression.

        Args:
            intent: dict with a non-blank string ``intent``, an optional
                ``domain`` and a ``data`` list of sentence parts. A part
                that has a ``name`` key is treated as a slot; every other
                part is plain text taken from its ``text`` key.
            index_entities_regex: dict mapping slot names to regex strings.

        Sets ``self.intent``, ``self.domain`` (``None`` when the domain is
        missing, not a string, or blank), ``self.data`` and the compiled
        ``self.patten``.

        Raises:
            AssertionError: if ``intent['intent']`` is not a non-blank string.
        """

        assert isinstance(intent['intent'], str), '错误的意图'
        assert intent['intent'].strip(), '意图不能为空'
        self.intent = intent['intent'].strip()

        # .get() so that an intent without a "domain" key is handled the
        # same way as one with an empty domain instead of raising KeyError.
        domain = intent.get('domain')
        if isinstance(domain, str) and domain.strip():
            self.domain = domain.strip()
        else:
            self.domain = None
        self.data = intent['data']

        # How many times each slot name has been seen so far; the counter is
        # appended to the group name so repeated slots get distinct groups.
        slot_index = {}

        def _replace(part):
            """Turn one sentence part into regex source.

            A slot part becomes a named group; a plain-text part becomes
            its cleaned text, with ``[[a|b]]`` expanded to an alternation.
            """
            if 'name' in part:
                slot_name = part['name']
                slot_index[slot_name] = slot_index.get(slot_name, 0) + 1
                if slot_name in index_entities_regex:
                    slot_regex = index_entities_regex[slot_name]
                else:
                    # No known entity regex: match the literal sample text.
                    slot_regex = clean_re(part['text'])
                # Both cases must supply every placeholder of the template;
                # omitting splitor/index in the fallback raised KeyError.
                return '(?P<{slot_name}{splitor}{index}>{slot_regex})'.format(
                    slot_name=slot_name,
                    splitor=self.slot_name_splitor,
                    index=slot_index[slot_name],
                    slot_regex=slot_regex)

            text = part['text']
            place = []

            def choice(x):
                """Swap a ``[[a|b]]`` group for a placeholder token."""
                options = x.group(1).split('|')
                place_id = '__place__{}__'.format(len(place))
                place.append((
                    place_id,
                    '(?:' + '|'.join([clean_re(xx) for xx in options]) + ')'))
                return place_id

            # Placeholders are inserted before clean_re() runs on the text
            # and are replaced by the real alternations afterwards.
            text = re.sub(
                r'\[\[([^\]]+)\]\]',
                choice,
                text)
            text = clean_re(text)
            for k, v in place:
                text = text.replace(k, v)
            return text

        self.patten = re.compile(
            '^' +
            ''.join([_replace(x) for x in self.data]) +
            '$')
        LOG.debug('pattens: %s', self.patten)
示例#5
0
def load_nlu_data(data_dir):
    """Load intent and entity definitions from a directory of YAML files.

    The directory tree under ``data_dir`` is walked recursively and every
    file whose name ends in ``.yml`` is parsed. Each file must contain a
    list (or tuple) of objects. Dict objects with an ``intent`` key are
    collected as intents, dict objects with an ``entity`` key as entities;
    anything else is ignored. Entities are passed through ``entity_merge``
    before being returned.

    Args:
        data_dir: path of the NLU data directory.

    Returns:
        ``(intents, entities)``.

    Raises:
        AssertionError: if the directory does not exist, holds no ``.yml``
            file, or a file / object violates the expected structure.
        Exception: if a file cannot be read or parsed as YAML.
    """
    assert os.path.exists(data_dir), '数据目录“{}”不存在'.format(data_dir)

    paths = [
        os.path.join(dirname, filename)
        for dirname, _, filenames in os.walk(data_dir)
        for filename in filenames
        if filename.endswith('.yml')
    ]

    assert paths, '找不到yaml数据文件,注意要以“.yml”后缀名结尾'

    entities = []
    intents = []

    for path in paths:
        # The data is non-ASCII text; read it as UTF-8 rather than with the
        # platform's locale encoding.
        with open(path, 'r', encoding='utf-8') as fp:
            try:
                # safe_load: plain data only, no arbitrary object
                # construction, and no Loader argument required.
                objs = yaml.safe_load(fp)
            except Exception as err:
                raise Exception(
                    '数据读取错误,可能不是合法YAML文件 “{}”'.format(path)) from err
            assert isinstance(objs, (list, tuple)), \
                '数据文件必须是list or tuple “{}”'.format(path)

            for obj in objs:
                if not isinstance(obj, dict):
                    continue
                if 'intent' in obj:
                    assert 'data' in obj, '意图必须包括“data”属性 “{}”'.format(
                        path)
                    assert isinstance(obj['data'], (list, tuple)) \
                        and obj['data'], \
                            '意图必须包括“data”且长度大于0 “{}”'.format(path)
                    intents.append(obj)
                elif 'entity' in obj:
                    assert 'data' in obj, \
                        '实体必须包括“data”属性 “{}”'.format(path)
                    # An entity may have empty data only when it copies
                    # its values from another entity.
                    assert 'copyFrom' in obj \
                        or (isinstance(obj['data'], (list, tuple))
                            and obj['data']), \
                            '有copyFrom,或者有“data”且长度大于0 “{}”'.format(path)
                    entities.append(obj)

    entities = entity_merge(entities)

    LOG.debug('读取到了 %s 个intent, %s 个entity', len(intents), len(entities))

    return intents, entities
示例#6
0
    def fit(self,
            sentence_result, domain_result,
            feature='tfidf',
            algorithm='LinearSVC'):
        """Build and train the intent model and the domain model.

        ``build_model`` supplies two untrained estimators together with
        the feature matrix and the two target vectors. The estimators are
        stored on ``self.model_intent`` / ``self.model_domain`` and each is
        fitted on the same features with its own targets.
        """

        LOG.debug('fit MLIntentClassifier')
        built = self.build_model(
            sentence_result, domain_result, feature, algorithm)
        intent_model, domain_model, features, intent_y, domain_y = built

        self.model_intent = intent_model
        self.model_domain = domain_model
        self.model_intent.fit(features, intent_y)
        self.model_domain.fit(features, domain_y)
示例#7
0
文件: app.py 项目: lijiarui/smp-nlu
def load_models(model_dir='./tmp/nlu_model'):
    """Load the pickled pipeline models stored in ``model_dir``.

    ``config.json`` in that directory lists the model names in pipeline
    order; each model is read from ``<name>.pkl`` next to it.

    Args:
        model_dir: directory containing ``config.json`` and the pickles.

    Returns:
        list of ``(model_name, model)`` tuples in the configured order.

    Raises:
        SystemExit: with code 1 when ``config.json`` does not exist.
    """
    config_path = os.path.join(model_dir, 'config.json')

    if not os.path.exists(config_path):
        LOG.error('config_path not exsits "%s"', config_path)
        # Same effect as exit(1) without relying on the site-provided helper.
        raise SystemExit(1)

    # Context managers so the file handles are closed deterministically.
    with open(config_path) as fp:
        pipeline_config = json.load(fp)

    models = []
    for model_name in pipeline_config:
        model_path = os.path.join(model_dir, '{}.pkl'.format(model_name))
        # NOTE(security): unpickling can execute arbitrary code; model_dir
        # must only ever contain files produced by a trusted training run.
        with open(model_path, 'rb') as fp:
            models.append((model_name, pickle.load(fp)))
    return models
示例#8
0
    def predict_slot(self, nlu_obj):
        """Detect slots and merge them into ``nlu_obj``.

        The lower-cased tokens are run through ``self.predict``; the
        resulting slots are stored under ``nlu_obj['crf_slot_filler']`` and
        tagged with ``from = 'crf_slot_filler'``. If ``nlu_obj['slots']`` is
        empty it is replaced by the new slots. Otherwise a new slot is only
        added when its ``pos`` range does not touch the range of any slot
        already present, and the list is re-sorted by start position after
        each addition.

        Returns the same ``nlu_obj`` that was passed in.
        """
        lowered = [token.lower() for token in nlu_obj['tokens']]
        prediction = self.predict([lowered])
        LOG.debug('crf_slot_filler raw %s', prediction)

        found = get_slots_detail(nlu_obj['tokens'], prediction[0])
        nlu_obj['crf_slot_filler'] = {'slots': found}
        for slot in found:
            slot['from'] = 'crf_slot_filler'

        if len(nlu_obj['slots']) <= 0:
            nlu_obj['slots'] = found
            return nlu_obj

        def _endpoint_inside(inner, outer):
            """True when the start or the end of inner lies within outer."""
            return (outer[0] <= inner[0] <= outer[1]
                    or outer[0] <= inner[1] <= outer[1])

        for slot in found:
            span = slot['pos']
            clashes = any(
                _endpoint_inside(span, known['pos'])
                or _endpoint_inside(known['pos'], span)
                for known in nlu_obj['slots'])
            if clashes:
                continue
            nlu_obj['slots'].append(slot)
            nlu_obj['slots'] = sorted(nlu_obj['slots'],
                                      key=lambda item: item['pos'][0])

        return nlu_obj
示例#9
0
def data_to_iob(intents, entities):
    """Convert intent and entity definitions into IOB training data.

    IOB stands for Inside-outside-beginning. Every intent is converted by
    ``convert_item`` in a parallel job, and the per-intent results are
    concatenated into three parallel lists.

    Side effects: NumPy's global random seed is set to 0 and a
    tab-separated dump of the result is written to ``/tmp/nlu_iob.txt``.

    Returns:
        ``(sentence_result, slot_result, domain_result)``.
    """

    np.random.seed(0)

    index_entities_data = get_index_entities_data(entities)

    # Log the number of values per entity, smallest first.
    entity_sizes = [
        (name, len(values)) for name, values in index_entities_data.items()
    ]
    entity_sizes.sort(key=lambda pair: pair[1])
    for name, size in entity_sizes:
        LOG.debug('kv %s %s', name, size)

    # Count how often each slot name occurs across all intents.
    slot_count = {}
    for intent in intents:
        for part in intent['data']:
            if 'name' not in part:
                continue
            name = part['name']
            slot_count[name] = slot_count.get(name, 0) + 1

    LOG.debug('parallel job %s', len(intents))
    jobs = (
        delayed(convert_item)(intent, index_entities_data, slot_count)
        for intent in intents)
    converted = Parallel(n_jobs=8, verbose=6)(jobs)

    LOG.debug('parallel job done')

    sentence_result, slot_result, domain_result = [], [], []
    for sentences, slots, domains in converted:
        sentence_result.extend(sentences)
        slot_result.extend(slots)
        domain_result.extend(domains)

    with open('/tmp/nlu_iob.txt', 'w') as fp:
        records = zip(sentence_result, slot_result, domain_result)
        for tokens, tags, domain in records:
            fp.write('\t'.join(tokens) + '\n')
            fp.write('\t'.join(tags) + '\n')
            fp.write(domain + '\n')
            fp.write('\n')

    LOG.debug('return IOB data')
    return sentence_result, slot_result, domain_result
示例#10
0
    def build_model(self, sentence_result, domain_result, feature, algorithm):
        """Prepare features, targets and two untrained classifiers.

        Each entry of ``domain_result`` is split on ``SPLITOR``: the first
        piece is the domain label, the second the intent label. Label/index
        lookup tables for both are stored on ``self``. The sentences are
        joined, lower-cased and vectorised with ``self.vectorizer``; the
        text/intent pairs are also dumped to
        ``/tmp/ml_intent_classifier.tmp``.

        Returns:
            ``(model_intent, model_domain, x_train, y_train_intent,
            y_train_domain)``.

        Raises:
            Exception: if ``algorithm`` is not 'RandomForest', 'SVC' or
                'LinearSVC'.
        """

        self.build_vectorizer(feature)

        x_text = [''.join(tokens).lower() for tokens in sentence_result]
        y_class_domain = [label.split(SPLITOR)[0] for label in domain_result]
        y_class_intent = [label.split(SPLITOR)[1] for label in domain_result]

        with open('/tmp/ml_intent_classifier.tmp', 'w') as fp:
            for text, intent in zip(x_text, y_class_intent):
                fp.write('{}\t{}\n'.format(text, intent))

        x_train = self.vectorizer.fit_transform(x_text)

        def _lookup_tables(classes):
            """Return (class -> index, index -> class) over sorted classes."""
            ordered = sorted(set(classes))
            to_index = {name: pos for pos, name in enumerate(ordered)}
            to_class = dict(enumerate(ordered))
            return to_index, to_class

        (self.intent_class_index,
         self.intent_index_class) = _lookup_tables(y_class_intent)
        LOG.debug('ml_intent_classifier intent class %s',
                  len(self.intent_class_index))
        y_train_intent = [
            self.intent_class_index[name] for name in y_class_intent]

        (self.domain_class_index,
         self.domain_index_class) = _lookup_tables(y_class_domain)
        LOG.debug('ml_intent_classifier domain class %s',
                  len(self.domain_class_index))
        y_train_domain = [
            self.domain_class_index[name] for name in y_class_domain]

        def _pair(factory, **params):
            """Create two independent estimators with identical settings."""
            return factory(**params), factory(**params)

        if algorithm == 'RandomForest':
            model_intent, model_domain = _pair(
                RandomForestClassifier,
                random_state=0, class_weight='balanced', n_jobs=-1)
        elif algorithm == 'SVC':
            model_intent, model_domain = _pair(
                SVC,
                random_state=0, probability=True, class_weight='balanced')
        elif algorithm == 'LinearSVC':
            model_intent, model_domain = _pair(
                LinearSVC,
                random_state=0, class_weight='balanced')
        else:
            raise Exception('Unknown algorithm "{}"'.format(algorithm))

        return (model_intent, model_domain,
                x_train, y_train_intent, y_train_domain)
示例#11
0
def build_model(nlu_data, model_dir, pipline_config):
    """Train every engine named in the pipeline config and save the models.

    Args:
        nlu_data: NLU data directory handed to ``load_nlu_data``.
        model_dir: output directory; created when it does not exist.
        pipline_config: iterable of engine names, trained in order.

    The IOB data is only built when the first engine that needs it is
    reached, and is then reused. After training, the config is written to
    ``config.json`` and each model to ``<engine name>.pkl`` in
    ``model_dir``.

    Raises:
        Exception: if the config contains an unknown engine name.
    """
    trained = []

    LOG.info('start build')

    intents, entities = load_nlu_data(nlu_data)
    cache = {}

    def _load_iob():
        """Build the IOB data on first use, then serve it from the cache."""
        if 'iob' not in cache:
            LOG.info('build IOB data')
            sentences, slots, domains = data_to_iob(intents, entities)
            cache['iob'] = (sentences, slots, domains)
        return cache['iob']

    for engine in pipline_config:
        LOG.info('train "%s"', engine)

        if engine == 'regex_engine':
            model = RegexEngine(intents, entities)

        elif engine == 'ml_intent_classifier':
            model = MLIntentClassifier()
            sentences, _, domains = _load_iob()
            model.fit(sentences, domains)

        elif engine == 'dl_intent_classifier':
            model = DLIntentClassifier()
            sentences, _, domains = _load_iob()
            model.fit(sentences, domains)

        elif engine == 'crf_slot_filler':
            model = CRFSlotFiller()
            sentences, slots, _ = _load_iob()
            model.fit(sentences, slots)

        elif engine == 'neural_slot_filler':
            model = NeuralSlotFiller()
            sentences, slots, _ = _load_iob()
            model.fit(sentences, slots)

        elif engine == 'neural_intent_classifier_slot_filler':
            model = NeuralIntentClassifierSlotFiller()
            sentences, slots, domains = _load_iob()
            # This engine learns slots and domain jointly from paired targets.
            model.fit(sentences, list(zip(slots, domains)))

        else:
            LOG.error('invalid engine "%s"', engine)
            raise Exception('invalid engine "%s"' % engine)

        trained.append((engine, model))

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    with open(os.path.join(model_dir, 'config.json'), 'w') as fp:
        json.dump(pipline_config, fp, indent=4, ensure_ascii=False)

    for engine, model in trained:
        target = os.path.join(model_dir, '{}.pkl'.format(engine))
        with open(target, 'wb') as fp:
            pickle.dump(model, fp)

    LOG.info('train and saved')
示例#12
0
    def fit(self,
            sentence_result,
            slot_result,
            max_iterations=100,
            c1=0.17,
            c2=0.01):
        """Train the CRF slot filler.

        Args:
            sentence_result: list of token sequences.
            slot_result: list of tag sequences parallel to
                ``sentence_result``.
            max_iterations: iteration limit for the regular training path.
            c1: value passed to the CRF as ``c1``.
            c2: value passed to the CRF as ``c2``.

        The distinct tags other than 'O' are stored, sorted, on
        ``self.labels``. The token/tag pairs are dumped to
        ``/tmp/crf_slot_filler.tmp``. When the environment variable ``CRF``
        equals ``search`` a randomized search over ``c1``/``c2`` is run
        (with its own fixed iteration limit) and the best estimator is
        kept; otherwise a single CRF is fitted with the given parameters.
        The fitted model is stored on ``self.crf``.

        Raises:
            AssertionError: on the regular path, if a feature sequence and
                its tag sequence differ in length.
        """

        self.c1 = c1
        self.c2 = c2
        self.max_iterations = max_iterations

        LOG.debug('fit CRFSlotFiller')

        x_train = sentences_to_features(sentence_result)
        y_train = slot_result

        labels = set()
        for tags in slot_result:
            labels.update(tags)
        # discard() rather than list.remove(): data in which no token is
        # tagged 'O' (or empty data) must not fail with ValueError here.
        labels.discard('O')
        labels = sorted(labels)
        LOG.debug('labels: %s', ', '.join(labels))
        self.labels = labels

        LOG.debug('CRFSlotFiller try write tmp train data')
        with open('/tmp/crf_slot_filler.tmp', 'w') as fp:
            for x, y in zip(sentence_result, slot_result):
                line = []
                for i, x_i in enumerate(x):
                    line.append('{}\t{}'.format(x_i, y[i]))
                fp.write('\n'.join(line) + '\n\n')
        LOG.debug('CRFSlotFiller try write tmp train data done')

        LOG.debug('x_train %d, y_train %d', len(x_train), len(y_train))

        if os.environ.get('CRF') == 'search':
            # Hyper-parameter search: c1/c2 are sampled, the arguments
            # given to fit() are not used for the estimator in this branch.
            crf = CRF(algorithm='lbfgs',
                      max_iterations=50,
                      all_possible_transitions=True)
            params_space = {
                'c1': scipy.stats.expon(scale=0.5),
                'c2': scipy.stats.expon(scale=0.05),
            }
            f1_score = make_scorer(metrics.flat_f1_score,
                                   average='weighted',
                                   labels=labels)
            rs = RandomizedSearchCV(crf,
                                    params_space,
                                    cv=3,
                                    verbose=1,
                                    n_jobs=2,
                                    n_iter=8 * 8,
                                    scoring=f1_score)
            rs.fit(x_train, y_train)
            LOG.debug('best params: %s', rs.best_params_)
            LOG.debug('best cv score: %s', rs.best_score_)
            self.crf = rs.best_estimator_
        else:

            crf = CRF(algorithm='lbfgs',
                      c1=c1,
                      c2=c2,
                      max_iterations=max_iterations,
                      all_possible_transitions=True)
            # Fail early with a readable message on misaligned samples.
            for x, y in zip(x_train, y_train):
                assert len(x) == len(y), '"{}", "{}" diff'.format(
                    str([xx['token'] for xx in x]), str(y))
            crf.fit(x_train, y_train)

            self.crf = crf
示例#13
0
文件: app.py 项目: lijiarui/smp-nlu
def web_parse(sentence=None):
    """Serve an NLU request for one sentence.

    A fresh result object is created with the sentence split into single
    characters as tokens, then passed through every model in ``MODELS`` in
    order. For each model the domain, intent and slot predictors are
    invoked, in that order, when the corresponding ``*_implement`` flag is
    truthy. The final object is returned as an indented JSON response with
    non-ASCII characters left unescaped.

    When ``sentence`` is None a JSON failure message is returned instead.
    """
    if sentence is None:
        return jsonify(success=False, message='sentence is None')

    nlu_obj = dict(
        intent=None,
        domain=None,
        slots=[],
        text=sentence,
        tokens=list(sentence),
    )

    start_time = time.time()

    def elapsed():
        """Seconds spent since the request processing started."""
        return time.time() - start_time

    LOG.debug('start %s models', len(MODELS))
    for model_name, model in MODELS:
        LOG.debug('through %s model %s', model_name, elapsed())
        if model.domain_implement:
            LOG.debug('through %s model predict_domain %s', model_name, elapsed())
            nlu_obj = model.predict_domain(nlu_obj)
        if model.intent_implement:
            LOG.debug('through %s model predict_intent %s', model_name, elapsed())
            nlu_obj = model.predict_intent(nlu_obj)
        if model.slot_implement:
            LOG.debug('through %s model predict_slot %s', model_name, elapsed())
            nlu_obj = model.predict_slot(nlu_obj)

    LOG.debug('return %s', elapsed())
    payload = simplejson.dumps(
        {'success': True, 'result': nlu_obj},
        indent=4, ensure_ascii=False)
    return APP.response_class(
        response=payload,
        status=200,
        mimetype='application/json'
    )