# Example #1
def train_dst(x_dst, y_dst):
    """Fit a dialog state tracker on (x_dst, y_dst) and return it."""
    tracker = DialogStateTracker()
    tracker.fit(x_dst, y_dst)

    logger.info('\n' + '-' * 30)
    logger.info('DST: trained')
    return tracker
# Example #2
def train_dpl(x_dpl, y_dpl):
    """Fit a dialog policy learner on (x_dpl, y_dpl) and return it."""
    policy = DialogPolicyLearning()
    policy.fit(x_dpl, y_dpl)

    logger.info('\n' + '-' * 30)
    logger.info('DPL: trained')
    return policy
# Example #3
def load_nlu_data(data_path):
    """Load NLU data (intents and entities) from a single YAML file.

    The file must have a top-level ``nlu`` key holding a list; each item
    is a dict describing either an intent (has an ``intent`` key and a
    non-empty ``data`` list) or an entity (has an ``entity`` key and
    either a ``copyFrom`` reference or a non-empty ``data`` list).

    Args:
        data_path: path to the YAML data file.

    Returns:
        ``(intents, entities)`` — two lists of dicts; entities are
        post-processed by ``entity_merge``.

    Raises:
        Exception: if the file is not valid YAML.
    """
    assert os.path.exists(data_path), '数据“{}”不存在'.format(data_path)

    entities = []
    intents = []

    with open(data_path, 'r') as fp:
        try:
            obj = yaml.load(fp, Loader=Loader)
        except yaml.YAMLError as err:
            # Only YAML parse errors mean "not a valid YAML file";
            # anything else (e.g. KeyboardInterrupt) must propagate.
            raise Exception(
                '数据读取错误,可能不是合法YAML文件 “{}”'.format(data_path)
            ) from err
        assert 'nlu' in obj
        objs = obj.get('nlu')
        assert isinstance(objs, (list, tuple)), \
            '数据文件必须是list or tuple “{}”'.format(data_path)

        for obj in objs:
            if isinstance(obj, dict):
                if 'intent' in obj:
                    assert 'data' in obj, \
                        '意图必须包括“data”属性 “{}”'.format(data_path)
                    assert isinstance(obj['data'], (list, tuple)) \
                        and obj['data'], \
                        '意图必须包括“data”且长度大于0 “{}”'.format(data_path)
                    intents.append(obj)
                elif 'entity' in obj:
                    assert 'data' in obj, \
                        '实体必须包括“data”属性 “{}”'.format(data_path)
                    assert 'copyFrom' in obj or (
                        isinstance(obj['data'], (list, tuple)) and obj['data']
                    ), '有copyFrom,或者有“data”且长度大于0 “{}”'.format(data_path)
                    entities.append(obj)

    entities = entity_merge(entities)

    logger.info('读取到了 %s 个intent, %s 个entity', len(intents), len(entities))

    return intents, entities
# Example #4
def data_to_iob(intents, entities):
    """Convert intent/entity data to IOB (inside-outside-beginning) format.

    Args:
        intents: list of intent dicts, each with a ``data`` list whose
            items may carry a ``name`` key (a slot reference).
        entities: list of entity dicts, indexed via
            ``get_index_entities_data``.

    Returns:
        ``(sentences, slots, domains, intents)`` — four parallel lists
        accumulated from the per-intent results of ``convert_item``.
    """
    # Fixed seed so the (parallel) conversion is reproducible.
    np.random.seed(0)
    index_entities_data = get_index_entities_data(entities)

    # Count how often each slot name is referenced across all intents;
    # passed to convert_item alongside the entity index.
    slot_count = {}
    for intent in intents:
        for item in intent['data']:
            if 'name' in item:
                slot_name = item['name']
                if slot_name not in slot_count:
                    slot_count[slot_name] = 0
                slot_count[slot_name] += 1

    sentence_result, slot_result, domain_result, intent_result = [], [], [], []

    # Fixed: the string had a stray f-prefix; %-style args are the
    # intended lazy logging form.
    logger.info('parallel job %s', len(intents))
    ret = Parallel(n_jobs=-1, verbose=6)(
        delayed(convert_item)(intent, index_entities_data, slot_count)
        for intent in intents)

    logger.info('parallel job done')

    # Flatten each worker's four partial lists into the four outputs.
    for r1, r2, r3, r4 in ret:
        sentence_result += r1
        slot_result += r2
        domain_result += r3
        intent_result += r4

    logger.info('return IOB data')
    return sentence_result, slot_result, domain_result, intent_result
# Example #5
def main(data_path, model_path, outside_function=None, n_history=3, n_times=50):
    """Train the full dialog pipeline and pickle it to ``model_path``.

    Trains FAQ, NLU, NLG, DST and DPL components, cross-checks the
    stories against the NLU/NLG label sets, and dumps everything to a
    single pickle file.

    Args:
        data_path: input data directory.
        model_path: output model file path.
        outside_function: external NLG functions (mapping); defaults to
            an empty dict. (Fixed: the original used a mutable ``{}``
            default argument.)
        n_history: number of dialog history turns to keep.
        n_times: number of story-combination rounds.
    """
    if outside_function is None:
        outside_function = {}

    faq = FrequentlyAskedQuestions()
    faq.fit(data_path)

    # NLU Part: pass FAQ questions only when any FAQ data was loaded.
    if len(faq):
        nlu = train_nlu(data_path, faq.questions)
    else:
        nlu = train_nlu(data_path)

    # NLG Part

    nlg = train_nlg(data_path)

    logger.info('\n' + '-' * 30)
    logger.info('NLG Intents:')
    logger.info('\n'.join(nlg.intent_list))

    # parse_story returns a dict of the following shape:
    # {
    #     'dialog': dialogs,
    #     'user_intent': user_intent_list,
    #     'user_domain': user_domain_list,
    #     'user_slot': user_slot_list,
    #     'sys_intent': sys_intent_list,
    #     'sys_slot': sys_slot_list,
    # }
    stories = parse_story(data_path)
    # Every label the stories reference must exist in NLU/NLG,
    # otherwise training data and runtime models would disagree.
    for ud in stories['user_domain']:
        assert ud in nlu.domain_list, 'user domain {} not in NLU'.format(ud)
    for ui in stories['user_intent']:
        assert ui in nlu.intent_list, 'user intent {} not in NLU'.format(ui)
    for us in stories['user_slot']:
        assert us in nlu.slot_list, 'user slot {} not in NLU'.format(us)
    for si in stories['sys_intent']:
        assert si in nlg.intent_list, 'sys intent {} not in NLG'.format(si)

    init_state = make_init_state(stories)
    (x_dst, y_dst, x_dpl, y_dpl) = build_dialog_train_data(stories['dialog'],
                                                           init_state,
                                                           n_history=n_history,
                                                           n_times=n_times)

    # DST Part

    dst = train_dst(x_dst, y_dst)

    # DPL Part
    dpl = train_dpl(x_dpl, y_dpl)

    data = {
        'init_state': init_state,
        'faq': faq,
        'nlu': nlu,
        'nlg': nlg,
        'dst': dst,
        'dpl': dpl,
    }

    model_path_dir = os.path.dirname(model_path)
    mkdir(model_path_dir)
    with open(model_path, 'wb') as fp:
        pickle.dump(data, fp)

    logger.info('\n')
    logger.info('Train done')
# Example #6
def train_nlu(data_path, faq_questions=None):
    """Build and fit the NLU module from data under ``<data_path>/nlu``."""
    nlu_dir = os.path.join(data_path, 'nlu')
    assert os.path.exists(nlu_dir), 'Invalid NLU data path'
    logger.info('Start train NLU')

    model = NaturalLanguageUnderstanding()
    model.fit(nlu_dir, faq_questions)

    logger.info('\n' + '-' * 30)
    logger.info('NLU Intents:')
    logger.info('\n'.join(model.intent_list))
    logger.info('NLU Slots')
    logger.info(model.slot_list)

    return model
# Example #7
    def fit(self, data_path, faq_questions=None):
        """Fit the NLU module.

        First loads all YAML data from the directory, then converts it
        into four parallel, equal-length lists: the sentences themselves,
        slots, domains and intents. Example for one sentence:
        sentences: [ ['我', '爱', '你'] ]
        slots: [ 'O', 'O', 'O' ]
        domains: [ 'life' ]
        intents: [ 'ask_love' ]

        Args:
            data_path: path handed to ``load_nlu_data``.
            faq_questions: optional iterable of FAQ question strings,
                trained under the special ``FAQ_INTENT`` label.
        """
        raw_intents, raw_entities = load_nlu_data(data_path)
        sentences, slots, domains, intents = data_to_iob(
            raw_intents, raw_entities)

        # Collect the distinct slot names from 'B_<name>' IOB tags.
        slot_list = []
        for slot in slots:
            for s in slot:
                if s.startswith('B_'):
                    if s[2:] not in slot_list:
                        slot_list.append(s[2:])
        self.slot_list = sorted(set(slot_list))

        # Handle the special FAQ intent.
        if faq_questions is None:
            faq_questions = []
        else:
            faq_questions = [list(x) for x in faq_questions] * 10
            # TODO: a questionable hyper-parameter — the ``* 10`` controls
            # the training ratio between FAQ and other dialog data.

        # Each entity contributes a B and an I tag, plus one class for O.
        self.ner_slot = NERSlotFiller(len(self.slot_list) * 2 + 1)

        self.ner_slot.fit(sentences, slots)

        # NOTE(review): evaluated on the training data itself — this is a
        # fit accuracy, not a held-out accuracy.
        slot_accuracy, _ = self.ner_slot.eval(sentences, slots)

        # One-hot label encoders; FAQ_INTENT is registered as an extra
        # class even when no FAQ questions were provided.
        self.intent_label = LabelBinarizer()
        self.intent_label.fit(intents + [FAQ_INTENT])

        self.domain_label = LabelBinarizer()
        self.domain_label.fit(domains + [FAQ_INTENT])

        self.intent_list = self.intent_label.classes_.tolist()
        self.domain_list = self.domain_label.classes_.tolist()

        # Character-level tokenizer over sentences plus FAQ questions.
        self.tokenizer = Tokenizer(num_words=self.vocab_size, char_level=True)
        self.tokenizer.fit_on_texts(sentences + faq_questions)

        seq = self.tokenizer.texts_to_sequences(sentences + faq_questions)
        seq_pad = pad_sequences(seq, maxlen=self.maxlen)

        self.intent_clr = get_model(self.intent_label.classes_.shape[0],
                                    n_vocab=self.vocab_size)
        self.domain_clr = get_model(self.domain_label.classes_.shape[0],
                                    n_vocab=self.vocab_size)

        # Every FAQ question is labeled with the special FAQ class, for
        # both the intent and the domain targets.
        y_intent = self.intent_label.transform(intents + [FAQ_INTENT] *
                                               len(faq_questions))
        y_domain = self.domain_label.transform(domains + [FAQ_INTENT] *
                                               len(faq_questions))
        # LabelBinarizer emits a single column for binary problems;
        # expand it to two columns to match the model's output layer.
        if 1 == y_domain.shape[1]:
            y_domain = to_categorical(y_domain, 2)

        self.intent_clr.fit(seq_pad, y_intent)
        self.domain_clr.fit(seq_pad, y_domain)

        loop = tqdm(zip(sentences, domains, intents), total=len(sentences))

        # Predict back on the training sequences to report fit accuracy.
        domain_ret = self.domain_label.inverse_transform(
            self.domain_clr.predict_proba(seq_pad))
        intent_ret = self.intent_label.inverse_transform(
            self.intent_clr.predict_proba(seq_pad))

        # Compare gold (b=domain, c=intent) against predictions per sample.
        ret = []
        for (a, b, c), dr, ir in zip(loop, domain_ret, intent_ret):
            ret.append((b == dr, c == ir))
        domain_accuracy, intent_accuracy = (np.sum([x[0] for x in ret]) /
                                            len(sentences),
                                            np.sum([x[1] for x in ret]) /
                                            len(sentences))
        logger.info(
            'domain_accuracy: %s\n' + 'intent_accuracy: %s\n' +
            'slot_accuracy: %s\n', domain_accuracy, intent_accuracy,
            slot_accuracy)
def build_dialog_train_data(dialogs, init_state, n_history, n_times):
    """Generate Trainning Data, include DST and DPL."""
    x_dst, y_dst = [], []
    x_dpl, y_dpl = [], []

    dialog_queue = []
    for i in range(len(dialogs) * 1000):
        dialog_queue.append('clean')
        for i in range(5):
            dialog_queue.append(random.choice(dialogs))
    logger.info('dialog_queue length %s', len(dialog_queue))

    history = []
    for i in range(n_history):
        history.append(init_state.clone())

    for dialog in dialog_queue:
        if dialog == 'clean':
            history = []
            for i in range(n_history):
                history.append(init_state.clone())
            continue
        for turn in dialog:
            state = history[-1].clone()  # last history
            if 'user' in turn:
                new_state = make_new_state(
                    init_state,
                    turn['domain'],
                    turn['intent'],
                    turn['slots']
                )

                slot_vec = state.slot_vec
                new_slot_vec = new_state.slot_vec
                y = np.array([
                    1. if a != b else 0.
                    for a, b in zip(
                        slot_vec.tolist(), new_slot_vec.tolist())
                ])

                x = np.array([
                    s.vec
                    for s in history
                ] + [new_state.vec])
                x_dst.append(x)
                y_dst.append(y)
                if np.sum(y) > 0:
                    for i in range(10):
                        x_dst.append(x)
                        y_dst.append(y)

                history = history[1:] + [new_state]
            if 'sys' in turn:
                x = np.array([s.vec for s in history])
                state.sys_intent = turn['intent']  # 设置为了获取y向量,不影响history
                y = state.sys_vec
                x_dpl.append(x)
                y_dpl.append(y)
                history[-1].sys_intent = turn['intent']  # 设置history

    x_dst = np.array(x_dst)
    y_dst = np.array(y_dst)
    x_dpl = np.array(x_dpl)
    y_dpl = np.array(y_dpl)
    logger.info(
        'dialog train data, x_dst %s y_dst %s x_dpl %s y_dpl %s',
        x_dst.shape, y_dst.shape, x_dpl.shape, y_dpl.shape)
    return x_dst, y_dst, x_dpl, y_dpl