# Example #1
def build_emb_matrix(corpus):
    """Build and persist the embedding matrix and word-to-id vocabulary.

    Args:
        corpus: iterable of token lists, one list per document.

    Side effects:
        Saves the embedding matrix to config.emb_matrix_fpath (numpy .npy)
        and writes the word2id mapping, one "word<TAB>id" pair per line,
        to config.word2id_fpath (UTF-8).
    """
    # Flatten the corpus into a single token list. A comprehension replaces
    # the original side-effecting map(lambda x: corpus_.extend(x), corpus),
    # which is a silent no-op on Python 3 where map() is lazy.
    flat_tokens = [token for doc in corpus for token in doc]
    word2id = NlpUtil.build_word2id(flat_tokens)
    word2vec = NlpUtil.load_word2vec(config.word2vec_fpath)
    emb_matrix = NlpUtil.build_emb_matrix(word2vec, config.embedding_size,
                                          word2id)
    np.save(config.emb_matrix_fpath, emb_matrix)
    with codecs.open(config.word2id_fpath, 'w', 'utf-8') as out_f:
        out_f.write('\n'.join(
            ['%s\t%d' % (k, v) for k, v in word2id.items()]))
    # .items() and print() work on both Python 2 and 3; the original
    # iteritems() / print-statement were Python 2 only.
    print('Build emb_matrix done')
# Example #2
    def process_dialog(cls, msg, use_task=True):
        """
        Dialog strategy: try the sub-task handler first; if it produces
        nothing, fall back to retrieval (search) or the generative
        seq2seq model, preferring search only on a confident match.
        """
        # Task response (only when the task subsystem is enabled).
        task_response = None
        if use_task:
            task_response, cls.dialog_status = TaskCore.task_handle(
                msg, cls.dialog_status)

        # Choose the search mode: for a short message with enough dialog
        # history, retrieve over the last three user turns joined by "<s>"
        # (chat-retrieval mode); otherwise treat it as a standalone query.
        context = cls.dialog_status.context
        if len(context) >= 3 and ch_count(msg) <= 4:
            mode = "cr"
            msg = "<s>".join(context[::2][-3:])
        else:
            mode = "qa"

        msg_tokens = NlpUtil.tokenize(msg, True)
        search_response, sim_score = SearchCore.search(msg_tokens, mode=mode)

        # Seq2seq (generative) response.
        seq2seq_response = cls._predict_via_seq2seq(msg_tokens)
        log_print("search_response=%s" % search_response)
        log_print("seq2seq_response=%s" % seq2seq_response)

        # Priority: task > confident search hit > generative fallback.
        if task_response:
            return task_response
        if sim_score >= 1.0:
            return search_response
        return seq2seq_response
# Example #3
 def precess_line(line, is_train_data=True):
     """Convert one raw CSV line into the tab-separated tokenized format.

     Args:
         line: raw input line; for training data the last comma-separated
             field is the label ("flag"), and the first field is an id.
         is_train_data: whether the line carries a trailing label.

     Returns:
         "id<TAB>tokens<TAB>flag\n" for training data, "id<TAB>tokens\n"
         otherwise, where tokens are '|'-joined and wrapped in <s>/</s>
         markers; returns None if parsing fails (best-effort).
     """
     try:
         line = line.strip()
         if is_train_data:
             line, flag = line.rsplit(',', 1)
         id_, text = line.split(',', 1)
         # '|' is the token separator below, '\t' the field separator, so
         # strip both from the free text first.
         text = text.replace('|', ' ')
         text = text.replace('\t', ' ')
         text = '|'.join(['<s>'] + NlpUtil.tokenize(text, True) + ['</s>'])
         return ('\t'.join([id_, text, flag]) +
                 '\n' if is_train_data else '\t'.join([id_, text]) + '\n')
     except Exception as e:
         # Fix: the original passed the format string and its arguments as
         # separate print() arguments, so the %s placeholders were never
         # substituted; apply the formatting explicitly.
         print('line=%s, errmsg=%s' % (line, e))
def info_supply_handle(msg, dialog_status):
    """Handle an information-supply turn.

    Searches with the info-supply pattern against the user's previous
    message (context[-3]) rather than the current one, and returns the
    retrieved response.
    """
    prev_user_msg = dialog_status.context[-3]
    tokens = NlpUtil.tokenize(prev_user_msg, True)
    response, _score = SearchCore.search(tokens, info_supply_pattern)
    return response
# Example #5
def build_emb_matrix(corpus):
    """Build and persist the embedding matrix and word-to-id vocabulary.

    Args:
        corpus: iterable of token lists, one list per document.

    Side effects:
        Saves the embedding matrix to config.emb_matrix_fpath (numpy .npy)
        and writes the word2id mapping, one "word<TAB>id" pair per line,
        to config.word2id_fpath (UTF-8).
    """
    # Flatten the corpus into a single token list. A comprehension replaces
    # the original side-effecting map(lambda x: corpus_.extend(x), corpus),
    # which is a silent no-op on Python 3 where map() is lazy.
    flat_tokens = [token for doc in corpus for token in doc]
    word2id = NlpUtil.build_word2id(flat_tokens)
    word2vec = NlpUtil.load_word2vec(config.word2vec_fpath)
    emb_matrix = NlpUtil.build_emb_matrix(word2vec, config.embedding_size,
                                          word2id)
    np.save(config.emb_matrix_fpath, emb_matrix)
    with codecs.open(config.word2id_fpath, 'w', 'utf-8') as out_f:
        out_f.write('\n'.join(
            ['%s\t%d' % (k, v) for k, v in word2id.items()]))
    # .items() and print() work on both Python 2 and 3; the original
    # iteritems() / print-statement were Python 2 only.
    print('Build emb_matrix done')


if __name__ == '__main__':
    # Preprocessing pipeline: tokenize raw train/predict files, train
    # word2vec on the tokenized corpus, then build the embedding matrix.
    # Steps are order-dependent: word2vec and the embedding matrix both
    # need the tokenized corpus produced by the first step.

    # Tokenize data
    tokenize_corpus(config.raw_train_fpath,
                    config.train_fpath,
                    is_train_data=True)
    tokenize_corpus(config.raw_predict_fpath,
                    config.predict_fpath,
                    is_train_data=False)
    corpus = _get_corpus()

    # Train word2vec
    NlpUtil.train_word2vec(corpus, './model/word2vec')

    # Build emb matrix
    build_emb_matrix(corpus)