Example #1
Result file: chat_parsed.txt

2. Read chat_parsed.txt, parse each session, merge consecutive q/a turns, extract order information, etc.
Result file: session_parsed.txt
================================================================================
"""

import re
from zb.tools.logger import create_logger

from data_pre.config import BaseConf
from jddc_utils import file_op

base_conf = BaseConf()

logger = create_logger(base_conf.log_file, name='pre', cmd=True)
logger.info("Logger create success, log file is %s" % base_conf.log_file)

def _init_res_file():
    file_qaqaq = base_conf.file_qaqaq
    file_a = base_conf.file_a
    file_op.empty_file(file_qaqaq)
    file_op.empty_file(file_a)
    return file_qaqaq, file_a

# ------------------------------------------------------------------------------

def _update_nums(sess_info, line_cols):
    """统计每一个session中用户和客服的说话次数"""
    if line_cols['waiter_send'] == '0':
        sess_info['q_nums'] += 1
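
# The header docstring mentions merging consecutive q/a turns. A minimal sketch of
# that step (hypothetical helper, not part of the project; it assumes each turn is
# a (sender, text) tuple with sender in {"q", "a"}):
def merge_consecutive_turns(turns):
    """Merge adjacent turns from the same side into a single utterance."""
    merged = []
    for sender, text in turns:
        if merged and merged[-1][0] == sender:
            # Same speaker as the previous turn: append the text.
            merged[-1] = (sender, merged[-1][1] + " " + text)
        else:
            merged.append((sender, text))
    return merged

# merge_consecutive_turns([("q", "hi"), ("q", "where is my order"), ("a", "checking")])
# -> [("q", "hi where is my order"), ("a", "checking")]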
Example #2
# coding=utf-8
from zb.tools.logger import create_logger
from zb.tools.file_tools import read_file
import os

from .utils import JiebaSeg, ApiSeg, LacSeg
from .similarity import SentenceSimilarity
from .config import BaseConf

conf = BaseConf()

logger = create_logger(conf.log_file, name='tfidf', cmd=conf.cmd_log)

if conf.refresh_model:
    logger.info("refresh model, prepare to delete old model file.")
    old_model_files = [
        os.path.join(conf.model_path, "corpus_and_dictionary.pkl"),
        os.path.join(conf.model_path, "tfidf.model"),
        os.path.join(conf.model_path, "index.index"),
        os.path.join(conf.model_path, "index.index.index.npy"),
    ]
    for model_file in old_model_files:
        if os.path.exists(model_file):
            os.remove(model_file)
    logger.info("old model file deleted.")


def run_prediction(input_file_path, output_file_path):
Example #3
from zb.tools.logger import create_logger
from jddc_utils import file_op

# Configuration parameters
# ------------------------------------------------------------------
from data_pre.config import BaseConf
base_conf = BaseConf()

conf = {
    "chat": base_conf.file_chat,
    "chat_pred": base_conf.file_chat_pred,
    "log_file": base_conf.log_file,
    "cmd_log": base_conf.cmd_log
}

logger = create_logger(conf['log_file'], name='chat_pre', cmd=conf['cmd_log'])

# ------------------------------------------------------------------


def transform_text(text):
    """特殊字符转换"""
    str_tf = {
        "#E-s[数字x]": "微笑",
        "#E-j[数字x]": "愤怒",
        " ": " ",
        "[数字x]%": "比例",
        "[金额x]%": "比例",
        "%": " ",
        "#": " ",
        "&": " ",
Example #4
def run_prediction(input_file_path, output_file_path):
    log_file = os.path.join(base_conf.log_path, 'prediction.log')
    logger = create_logger(log_file, name='predictor', cmd=True)
    logger.info('run prediction ...')
    test_conf = TestConf()
    with tf.Session() as sess:
        # Create model structure and load parameters
        model = create_model(sess, True, model_path=test_conf.model)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        enc_vocab_path = os.path.join(test_conf.work_path,
                                      "vocab%d.enc" % test_conf.enc_vocab_size)
        dec_vocab_path = os.path.join(test_conf.work_path,
                                      "vocab%d.dec" % test_conf.dec_vocab_size)

        enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
        _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)

        # Decode from standard input.
        test_path = input_file_path
        result_path = output_file_path
        empty_file(result_path)
        with codecs.open(test_path, mode='r', encoding='utf-8') as rf:
            with codecs.open(result_path, mode='a', encoding='utf-8') as wf:
                try:
                    sentence = rf.readline()
                    while sentence:
                        sentence = sentence.rstrip('<s>')
                        # Get token-ids for the input sentence.
                        logger.info("current sentence: " + sentence)
                        token_ids = data_utils.sentence_to_token_ids(
                            sentence, enc_vocab)
                        logger.info("token_ids: " +
                                    ' '.join([str(i) for i in token_ids]))
                        # Which bucket does it belong to?
                        bucket_id = min([
                            b for b in range(len(_buckets))
                            if _buckets[b][0] > len(token_ids)
                        ])
                        logger.info('bucket_id: ' + str(bucket_id))
                        # Get a 1-element batch to feed the sentence to the model.
                        encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                            {bucket_id: [(token_ids, [])]}, bucket_id)
                        """logits可以理解成未进入softmax的概率,一般是输出层的输出,softmax的输入"""
                        # Get output logits for the sentence.
                        _, _, output_logits = model.step(
                            sess, encoder_inputs, decoder_inputs,
                            target_weights, bucket_id, True)
                        # This is a greedy decoder - outputs are just argmaxes of output_logits.
                        outputs = [
                            int(np.argmax(logit, axis=1))
                            for logit in output_logits
                        ]
                        # If there is an EOS symbol in outputs, cut them at that point.
                        if data_utils.EOS_ID in outputs:
                            outputs = outputs[:outputs.index(data_utils.EOS_ID
                                                             )]
                        # Convert the output token ids back to a response sentence.
                        result = "".join([
                            tf.compat.as_str(rev_dec_vocab[output])
                            for output in outputs if tf.compat.as_str(
                                rev_dec_vocab[output]) not in [",", "_UNK"]
                        ])
                        wf.write(result + '\n')
                        logger.info("result: " + result)
                        sentence = rf.readline()
                except Exception as e:
                    traceback.print_exc()
                    logging.error("run prediction fail:", e)
    logger.info('run prediction end!')
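

if __name__ == "__main__":
    # Hypothetical wiring for illustration only; the entry point and the file
    # paths below are placeholders, not taken from the project.
    run_prediction("data/questions.txt", "data/answers_pred.txt")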
Example #5
def train():
    train_conf = TrainConf()
    log_file = os.path.join(train_conf.log_path, 'train.log')
    logger = create_logger(log_file, name='Train', cmd=True)
    logger.info("start training")

    # prepare dataset
    logger.info("prepare dataset ...")
    enc_train, dec_train, enc_dev, dec_dev, _, _ = data_utils.prepare_custom_data(
        train_conf.work_path, train_conf.train_enc, train_conf.train_dec,
        train_conf.dev_enc, train_conf.dev_dec, train_conf.enc_vocab_size,
        train_conf.dec_vocab_size)
    logger.info("dataset prepared!")

    logger.info("enc_train: %s; dec_train: %s;" % (enc_train, dec_train))
    logger.info("enc_dev: %s; dec_dev: %s;" % (enc_dev, dec_dev))

    # setup config to use BFC allocator
    config = tf.ConfigProto(
        device_count={"CPU": 8},
        inter_op_parallelism_threads=1,
        intra_op_parallelism_threads=1,
    )
    config.gpu_options.allocator_type = 'BFC'

    with tf.Session(config=config) as sess:
        # Create model.
        logger.info("Creating %d layers of %d units." %
                    (train_conf.num_layers, train_conf.layer_size))
        model = create_model(sess, False)

        logger.info("Read data into buckets and compute their sizes.")
        dev_set = read_data(enc_dev, dec_dev)
        train_set = read_data(enc_train, dec_train,
                              train_conf.max_train_data_size)
        train_bucket_sizes = [len(train_set[b]) for b in range(len(_buckets))]
        train_total_size = float(sum(train_bucket_sizes))

        # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use
        # to select a bucket. Length of [scale[i], scale[i+1]] is proportional to
        # the size of the i-th training bucket, as used later.
        train_buckets_scale = [
            sum(train_bucket_sizes[:i + 1]) / train_total_size
            for i in range(len(train_bucket_sizes))
        ]
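        # Worked example: bucket sizes [100, 300, 400, 200] (total 1000) give a scale
        # of [0.1, 0.4, 0.8, 1.0]; a random draw of 0.55 in the loop below then picks
        # bucket 2, since 0.8 is the first scale entry greater than 0.55.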

        # This is the training loop.
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_losses = []
        while True:
            # Choose a bucket according to data distribution. We pick a random number
            # in [0, 1] and use the corresponding interval in train_buckets_scale.
            random_number_01 = np.random.random_sample()
            bucket_id = min([
                i for i in range(len(train_buckets_scale))
                if train_buckets_scale[i] > random_number_01
            ])

            # Get a batch and make a step.
            start_time = time.time()
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                train_set, bucket_id)
            _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                         target_weights, bucket_id, False)
            step_time += (time.time() -
                          start_time) / train_conf.steps_per_checkpoint
            loss += step_loss / train_conf.steps_per_checkpoint
            current_step += 1

            # Once in a while, we save checkpoint, print statistics, and run evals.
            if current_step % train_conf.steps_per_checkpoint == 0:
                # Print statistics for the previous epoch.
                perplexity = math.exp(loss) if loss < 300 else float('inf')
                logger.info(
                    "global step %d learning rate %.4f step-time %.2f perplexity "
                    "%.2f" %
                    (model.global_step.eval(), model.learning_rate.eval(),
                     step_time, perplexity))
                # Decrease learning rate if no improvement was seen over last 3 times.
                if len(previous_losses) > 2 and loss > max(
                        previous_losses[-3:]):
                    sess.run(model.learning_rate_decay_op)
                previous_losses.append(loss)
                # Save checkpoint and zero timer and loss. save the model!!
                checkpoint_path = os.path.join(train_conf.work_path,
                                               "seq2seq.ckpt")
                model.saver.save(sess,
                                 checkpoint_path,
                                 global_step=model.global_step)
                step_time, loss = 0.0, 0.0
                # Run evals on development set and print their perplexity.
                for bucket_id in range(len(_buckets)):
                    if len(dev_set[bucket_id]) == 0:
                        logger.info("  eval: empty bucket %d" % bucket_id)
                        continue
                    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                        dev_set, bucket_id)
                    _, eval_loss, _ = model.step(sess, encoder_inputs,
                                                 decoder_inputs,
                                                 target_weights, bucket_id,
                                                 True)
                    eval_ppx = math.exp(
                        eval_loss) if eval_loss < 300 else float('inf')
                    logger.info("  eval: bucket %d perplexity %.2f" %
                                (bucket_id, eval_ppx))
Example #6
import os
import math
import time
import codecs
import logging
import traceback

import numpy as np
import tensorflow as tf
from zb.tools.logger import create_logger

from seq2seq import data_utils
from seq2seq.seq2seq_model import Seq2SeqModel
from seq2seq.config import BaseConf, TestConf, TrainConf

base_conf = BaseConf()

log_file = os.path.join(base_conf.log_path, 'execute.log')
logger = create_logger(log_file, name='exec', cmd=True)

# We use a number of buckets and pad to the closest one for efficiency.
# See seq2seq_model.Seq2SeqModel for details of how they work.
# (source_size, target_size)
_buckets = [(50, 50), (80, 60), (150, 70), (500, 90)]
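# For example, a 100-token source sentence falls into the smallest bucket whose
# source size exceeds its length:
#   min(b for b in range(len(_buckets)) if _buckets[b][0] > 100) == 2   # the (150, 70) bucket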


def empty_file(file):
    with codecs.open(file, mode='w', encoding='utf-8') as f:
        f.truncate()


def read_data(source_path, target_path, max_size=None):
    """Read data from source and target files and put into buckets.
Example #7
=============================================================================
"""

import requests
import grequests
from zb.tools.logger import create_logger

from jddc_utils import file_op

# Configuration parameters
# ------------------------------------------------------------------------------

from data_pre.config import BaseConf
conf = BaseConf()

logger = create_logger(conf.log_file, name="tokenize", cmd=conf.cmd_log)

def get_text_tokenize(text):
    """调用京东的分词器API"""
    token = conf.api_token
    inner = conf.inner
    headers = {
        "User-Agent": 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 '
                      'Mobile Safari/537.36',
    }
    url_outer = "http://jdialog-lexeme.jd.com/lexeme?token={token}&text={text}"
    url_inner = "http://jdialog-lexeme-stage.jd.com/lexeme?token={token}&text={text}"

    if inner:
        url = url_inner.format(token=token, text=text)
Example #8
class Config(object):
    # base path
    base_path = "/submitwork/data"
    file_stopwords = os.path.join(base_path, 'stopwords.txt')

    # data path
    data_path = os.path.join(base_path, 'prepared')
    # Switch the QA dataset here; the choice of dataset has a large impact on the score.
    file_questions = os.path.join(data_path, 'questions.txt')
    file_answers = os.path.join(data_path, 'answers.txt')

    # conf for results
    res_path = os.path.join(base_path, 'BM25')
    insure_folder_exists(res_path)
    file_questions_segs = os.path.join(res_path, 'questions_segs.txt')
    log_file = os.path.join(res_path, 'bm25_implement.log')

    # other
    top = 5
    cmd_log = True
    for_search = True
    api_token = '9fb9785b4ea044e5871a8cbdae354e03'
    inner = False


# ------------------------------------------------------------------------------------------------------

conf = Config()
logger = create_logger(name='bm25', log_file=conf.log_file, cmd=conf.cmd_log)
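

# A minimal retrieval sketch for the config above. It assumes the rank_bm25 package
# (pip install rank-bm25) and that the questions file is whitespace-tokenized; the
# project's own BM25 implementation may differ.
def bm25_top_answers(query_tokens):
    """Return the answers whose questions best match the query (illustration only)."""
    from rank_bm25 import BM25Okapi

    with open(conf.file_questions, encoding="utf-8") as f:
        questions = [line.split() for line in f]      # one tokenized question per line
    with open(conf.file_answers, encoding="utf-8") as f:
        answers = [line.strip() for line in f]        # answer aligned with each question
    bm25 = BM25Okapi(questions)
    scores = bm25.get_scores(query_tokens)
    best = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:conf.top]
    return [answers[i] for i in best]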