Example #1
def predict(domain, locale, userUtterance):

    response = json.loads('{"response":"ERROR: error during predicting the user utterance"}')

    # the availability check must be invoked; a bare method reference is always truthy
    if not nlp_config.checkDataAvaialble():
        log_util.logerrormsg("[PREDICT_MODEL] no intent data found. Exiting...")
        return json.loads('{"response":"ERROR: no intent data found. Exiting..."}')

    if nlp_config.getParameter('ALGORITHM') == 'TFIDF':
        response = predict_tfidf.predict(domain, locale, userUtterance)
    elif nlp_config.getParameter('ALGORITHM') == 'NLU':
        response = predict_nlu.predict(domain, locale, userUtterance)
    else:
        log_util.logerrormsg("[PREDICT_MODEL] configured algorithm is not supported. Exiting...")
    return response
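
A brief usage sketch of the dispatcher above; the import path follows Example #7, and the domain, locale, and utterance values are purely illustrative:

# illustrative call; assumes nlp_config.loadParameters() has already run
from core import predict_model

response = predict_model.predict('weather', 'en', 'will it rain tomorrow')
print(response)  # a dict on the error paths above, otherwise the backend's payload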
Example #2
def train(domain, locale):
    response = json.loads(
        '{"response":"ERROR: Error during training the data"}')

    if not nlp_config.checkDataAvaialble():
        log_util.logerrormsg("[TRAIN_MODEL] no intent data found. Exiting...")
        return response

    if nlp_config.getParameter('ALGORITHM') == 'TFIDF':
        response = train_tfidf.train(domain, locale,
                                     nlp_config.getProperties())
    elif nlp_config.getParameter('ALGORITHM') == 'NLU':
        response = train_nlu.train(domain, locale, nlp_config.getProperties())
    else:
        log_util.logerrormsg(
            "[TRAIN_MODEL] configured algorithm is not supported. Exiting...")
    return response
Example #3
def sendMessgae(topicName, key, value):
    # `producer` is created by this module's initialise() (see Example #7)
    global producer
    # derive the target partition from the message key
    pNum = utils.getPartition(key, int(nlp_config.getParameter('PARTITIONS')))
    msg = json.loads(value)
    log_util.loginfomsg("[PRODUCER] sending message: \"{}\"".format(value))
    producer.send(topicName,
                  value=msg,
                  key=key.encode('utf-8'),
                  partition=pNum)
    producer.flush()  # flush so the message is actually sent before returning
    log_util.loginfomsg(
        "[PRODUCER] message sent with key: \"{}\" to partition \"{}\"!".format(
            key, pNum))
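
A hedged usage sketch for the producer: initialise() must run before the first send (Example #7 does this at start-up); the key and payload below are illustrative and follow the shapes consumed in Example #8:

import json

from utils import nlp_config
from pubsub import producer as pr

pr.initialise()  # creates the module-level `producer` used by sendMessgae()
pr.sendMessgae(nlp_config.getParameter('TOPIC_BOT_TO_NLP'),
               'DUMMY_0',  # hypothetical key; the real key layout comes from utils.parseKey
               json.dumps({"messageId": "TRAIN",
                           "domain": "weather", "locale": "en"}))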
Example #4
    def trainDomain():
        # 'domain' is required; 'locale' falls back to English
        if not request.args.get('domain'):
            log_util.logerrormsg("[APP] missing domain parameter")
            abort(400)  # a missing parameter is a bad request, not a missing resource
        locale = request.args.get('locale') or 'en'
        domain = request.args.get('domain')
        res = train_model.train(domain, locale)
        n = int(json.loads(res)["utterances"])

        if nlp_config.getParameter('ALGORITHM') == 'TFIDF':
            md = 'TFIDF'
        else:
            # derive the backend label from the config file name (see Example #6)
            algo = os.path.splitext(nlp_config.getParameter('CONFIG_FILE'))[0]
            algo = algo.split("_")[1].upper()
            md = 'NLU:' + algo

        # report success only when at least one utterance was trained
        response = {
            "messageId": "TRAIN_SUCCESS" if n > 0 else "TRAIN_FAIL",
            "domain": domain,
            "locale": locale,
            "message": res,
            "model": md
        }

        return make_response(
            jsonify(response), 200,
            {'Content-Type': 'application/json; charset=utf-8'})
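
trainDomain() is indented one level, which suggests it is defined inside a factory or directly under a route decorator; one plausible wiring, with the URL rule assumed for illustration:

# hypothetical registration; the actual URL rule is not shown in this example
@app.route('/train')
def trainDomain():
    ...  # body as above

# a client would then call: GET /train?domain=weather&locale=en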
Example #5
def train(domain, locale, prop):
    datapath = os.path.join(scriptDir, '..', '..', '..', 'trainingData',
                            'intents')
    vectorDimension = int(nlp_config.getParameter('VECTOR_DIMENSION'))
    iterationNumbers = int(nlp_config.getParameter('ITERATION_NUMBER'))
    format = nlp_config.getParameter('FORMAT')

    utterance = []
    intent = []

    if format == 'md':
        utterance, intent = process_data(domain, locale)
        if not utterance or not intent:
            log_util.logerrormsg(
                "[TRAIN_TFIDF] could not parse the markdown data. Exiting...")
            # serialize the failure payload as proper JSON
            return json.dumps({"intents": "-1", "utterances": "-1"})
    elif format == 'json':
        # datapath already includes scriptDir, so join from it directly
        fileData = os.path.join(datapath, domain + '_' + locale + '.json')
        with codecs.open(fileData, 'r', 'utf-8') as dataFile:
            data = json.load(dataFile)
        # flatten the per-task utterances into parallel utterance/intent lists
        for nameUtterances in data['tasks']:
            for utt in nameUtterances['utterances']:
                utterance.append(utt)
                intent.append(nameUtterances['name'])
    else:
        log_util.logerrormsg("[TRAIN_TFIDF] unsupported format. Exiting...")
        return json.dumps({"intents": "-1", "utterances": "-1"})

    mIntent = set(intent)

    # is_config_stale() evaluates truthy when neither the training data nor the
    # config changed, so the previously trained model can be reused
    # (same convention as train_nlu.train in Example #6)
    if nlp_config.is_config_stale(domain, locale, prop):
        log_util.loginfomsg(
            "[TRAIN_TFIDF] no changes found to training data, using pre-trained model"
        )
        res = {"intents": str(len(mIntent)), "utterances": str(len(intent))}
        return json.dumps(res)

    stopListFile = os.path.join(scriptDir, '..', '..', 'dictionary',
                                'stopwords_' + locale + '.txt')
    arrayWords = []
    stopWords = []

    # read the comma-separated stop-word lists, one or more entries per line
    with codecs.open(stopListFile, 'r', 'utf-8') as f:
        lines = f.read().split("\n")
    for line in lines:
        if line != "":
            arrayWords.append(line.split(','))

    for a_word in arrayWords:
        for s_word in a_word:
            if re.sub(' ', '', s_word) != "":
                stopWords.append(s_word)

    extraStopWords = set(stopWords)
    # map supported locales to their NLTK stop-word corpora; Hindi ('hi') and
    # Marathi ('mr') have no NLTK corpus, so only the custom list applies
    nltkCorpus = {'ar': 'arabic', 'da': 'danish', 'en': 'english',
                  'es': 'spanish', 'nl': 'dutch', 'sv': 'swedish'}
    if locale in nltkCorpus:
        stops = set(stopwords.words(nltkCorpus[locale])) | extraStopWords
    elif locale in ('hi', 'mr'):
        stops = extraStopWords
    else:
        res = {"intents": "0", "utterances": "0"}
        return json.dumps(res)

    stemmer.setLocale(locale)

    # the corpus is passed to fit_transform(); TfidfVectorizer's first positional
    # parameter is `input` (a mode flag), not the training data
    tfidfVec = TfidfVectorizer(decode_error='ignore',
                               stop_words=stops,
                               ngram_range=(1, 5),
                               tokenizer=stemmer.stemTokenize)
    trainsetIdfVectorizer = tfidfVec.fit_transform(utterance).toarray()
    vLength = trainsetIdfVectorizer.shape[1]  # feature count; safe even with one utterance
    # TruncatedSVD requires n_components to be smaller than the feature count
    nDimension = vectorDimension
    if vLength <= vectorDimension:
        nDimension = vLength - 1

    svd = TruncatedSVD(n_components=nDimension,
                       algorithm='randomized',
                       n_iter=iterationNumbers,
                       random_state=42)
    trainLSA = svd.fit_transform(trainsetIdfVectorizer)

    # persist every artifact the TFIDF predictor needs
    pickle_path = os.path.join(scriptDir, '..', '..', 'models', 'tfidf',
                               domain + '_' + locale + '_')
    artifacts = {
        'utterance.m': utterance,
        'intent.m': intent,
        'tfidfVec.m': tfidfVec,
        'svd.m': svd,
        'trainLSA.m': trainLSA,
    }
    for suffix, obj in artifacts.items():
        with open(pickle_path + suffix, 'wb') as fileObject:
            pickle.dump(obj, fileObject)

    log_util.loginfomsg(f'[TRAIN_TFIDF] identified domain: {domain}')
    log_util.loginfomsg(f'[TRAIN_TFIDF] identified locale: {locale}')
    log_util.loginfomsg(
        f'[TRAIN_TFIDF] number of utterances for training: {len(intent)}')
    log_util.loginfomsg(
        f'[TRAIN_TFIDF] number of intents for training: {len(mIntent)}')

    res = {
        "intents": str(len(mIntent)),
        "utterances": str(len(intent)),
        "model": "TFIDF"
    }
    return json.dumps(res)
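
The real predict_tfidf module is not shown here; the following is a speculative sketch of how the five persisted artifacts could be consumed at predict time, with the cosine-similarity ranking an assumption rather than the project's confirmed logic:

import os
import pickle

from sklearn.metrics.pairwise import cosine_similarity

scriptDir = os.path.dirname(__file__)

def predict(domain, locale, userUtterance):
    # note: unpickling tfidfVec requires the stemmer module to be importable,
    # since the vectorizer was saved with tokenizer=stemmer.stemTokenize
    pickle_path = os.path.join(scriptDir, '..', '..', 'models', 'tfidf',
                               domain + '_' + locale + '_')
    with open(pickle_path + 'tfidfVec.m', 'rb') as f:
        tfidfVec = pickle.load(f)
    with open(pickle_path + 'svd.m', 'rb') as f:
        svd = pickle.load(f)
    with open(pickle_path + 'trainLSA.m', 'rb') as f:
        trainLSA = pickle.load(f)
    with open(pickle_path + 'intent.m', 'rb') as f:
        intent = pickle.load(f)

    # project the utterance into the same LSA space and rank by cosine similarity
    queryLSA = svd.transform(tfidfVec.transform([userUtterance]).toarray())
    scores = cosine_similarity(queryLSA, trainLSA)[0]
    return intent[scores.argmax()]  # the real module wraps this in a JSON response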
Example #6
def train(domain, locale, prop):
    format = nlp_config.getParameter('FORMAT')
    dataFile = os.path.join(scriptDir, '..', '..', '..', 'trainingData',
                            'intents', domain + '_' + locale + '.' + format)
    configFile = os.path.join(scriptDir, '..', '..', 'config',
                              nlp_config.getParameter('CONFIG_FILE'))
    modelFile = os.path.join(scriptDir, '..', '..', 'models', 'nlu')
    MODEL_NAME = domain + '_' + locale

    try:
        if format in ('md', 'json', 'yml'):
            training_data = load_data(dataFile)
            trainer = Trainer(config.load(configFile))

            # is_config_stale() evaluates truthy when nothing changed, so only
            # retrain and persist when the data or the config was modified
            if not nlp_config.is_config_stale(domain, locale, prop):
                trainer.train(training_data)
                # delete the previous model folder if it exists
                modelDir = os.path.join(modelFile, MODEL_NAME)
                if os.path.exists(modelDir):
                    shutil.rmtree(modelDir)
                trainer.persist(modelFile, fixed_model_name=MODEL_NAME)
            else:
                log_util.loginfomsg(
                    "[TRAIN_NLU] no changes found to training data, using pre-trained model"
                )
        else:
            log_util.logerrormsg("[TRAIN_NLU] unsupported format. Exiting...")
            return json.dumps({"intents": "-1", "utterances": "-1"})
    except FileNotFoundError:
        log_util.logerrormsg(
            "[TRAIN_NLU] could not locate the training data or NLU config file")
        return json.dumps({"intents": "-1", "utterances": "-1"})

    # group the training examples by intent, then count all utterances
    training_examples = OrderedDict()
    INTENT = 'intent'
    for example in [e.as_dict_nlu() for e in training_data.training_examples]:
        training_examples.setdefault(example[INTENT], []).append(example)
    count = sum(len(examples) for examples in training_examples.values())

    log_util.loginfomsg(f'[TRAIN_NLU] identified domain: {domain}')
    log_util.loginfomsg(f'[TRAIN_NLU] identified locale: {locale}')
    log_util.loginfomsg(
        f'[TRAIN_NLU] number of utterances for training: {count}')
    log_util.loginfomsg(
        f'[TRAIN_NLU] number of intents for training: {len(training_examples)}'
    )

    # derive the backend label from the config file name,
    # e.g. a file named config_spacy.yml yields NLU:SPACY
    algo = os.path.splitext(nlp_config.getParameter('CONFIG_FILE'))[0]
    algo = algo.split("_")[1].upper()
    model = 'NLU:' + algo

    res = {
        "intents": str(len(training_examples)),
        "utterances": str(count),
        "model": model
    }
    return json.dumps(res)
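
To make the file-name convention concrete: the split on '_' implies CONFIG_FILE names of the shape config_<backend>.yml. A tiny self-contained check, with the file name assumed for illustration:

import os

configFile = 'config_spacy.yml'  # hypothetical name matching the split logic
algo = os.path.splitext(configFile)[0].split('_')[1].upper()
print('NLU:' + algo)  # -> NLU:SPACY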
Example #7
import os
import threading
from warnings import simplefilter

import flask

from utils import nlp_config
from utils import log_util
from core import train_model, predict_model
from pubsub import consumer
from pubsub import processMessage
from pubsub import producer as pr

scriptDir = os.path.dirname(__file__)

# ignore all warnings
simplefilter(action='ignore')

# load all the config parameters
nlp_config.loadParameters()

# USE_BROKER toggles between the broker listener and the REST API
if nlp_config.getParameter('USE_BROKER').lower() == 'true':
    log_util.loginfomsg("[APP] broker based NLPEngine enabled")
    # initialise the producer
    pr.initialise()
    # Run consumer listener to process all the NLP_TO_BOT messages
    consumer_ = consumer.initialise(
        nlp_config.getParameter('TOPIC_BOT_TO_NLP'))
    for msg in consumer_:
        log_util.loginfomsg(msg)
        t = threading.Thread(target=processMessage.process, args=(msg, ))
        t.start()
else:
    log_util.loginfomsg("[APP] REST API based NLPEngine enabled")
    app = flask.Flask(__name__)
    SERVER_HOST = '0.0.0.0'
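
The REST branch ends abruptly after creating the Flask app; a speculative sketch of how it plausibly continues inside that else block (the route path and port value are assumptions, not from the source):

    # speculative continuation of the else branch above
    @app.route('/train')
    def trainDomain():
        ...  # handler body as shown in Example #4

    app.run(host=SERVER_HOST, port=5000)  # port is illustrative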
Example #8
def process(message):
    key = message.key.decode('utf-8')
    log_util.loginfomsg('[PROCESS_MESSAGE]: message received with key: ' +
                        key + ' message: ' + str(message.value))
    # check whether the message is a control command (TRAIN or PREDICT)
    if utils.parseKey(key) == 0 and key.find('DUMMY') != -1:
        if 'messageId' in message.value and message.value[
                'messageId'] == 'TRAIN':
            domain = message.value['domain']
            locale = message.value['locale']
            log_util.loginfomsg(
                '[PROCESS_MESSAGE] training the NLP for domain:{} and locale:{}'.
                format(domain, locale))
            res = train_model.train(domain, locale)
            n = int(json.loads(res)["utterances"])
            # report success only when at least one utterance was trained
            res = {
                "messageId": "TRAIN_SUCCESS" if n > 0 else "TRAIN_FAIL",
                "domain": domain,
                "locale": locale,
                "message": res
            }
            producer.sendMessgae(nlp_config.getParameter('TOPIC_NLP_TO_BOT'),
                                 key, json.dumps(res))
        elif 'messageId' in message.value and message.value[
                'messageId'] == 'PREDICT':
            domain = message.value['domain']
            locale = message.value['locale']
            utterance = message.value['userUtterance']
            log_util.loginfomsg(
                '[PROCESS_MESSAGE] predicting the utterance:{} for domain:{} and locale:{}'
                .format(utterance, domain, locale))
            result = predict_model.predict(domain, locale, utterance)
            res = {
                "messageId": "PREDICT",
                "domain": domain,
                "locale": locale,
                "userUtterance": utterance,
                "message": result
            }
            producer.sendMessgae(nlp_config.getParameter('TOPIC_NLP_TO_BOT'),
                                 key, json.dumps(res))
    else:
        # any other key carries a plain user utterance; predict it directly
        domain = message.value['domain']
        locale = message.value['locale']
        utterance = message.value['userUtterance']
        log_util.loginfomsg(
            '[PROCESS_MESSAGE] processing the utterance:{} for domain:{} and locale:{}'
            .format(utterance, domain, locale))
        result = predict_model.predict(domain, locale, utterance)
        res = {
            "messageId": "PREDICT",
            "domain": domain,
            "locale": locale,
            "userUtterance": utterance,
            "message": result
        }
        producer.sendMessgae(nlp_config.getParameter('TOPIC_NLP_TO_BOT'), key,
                             json.dumps(res))
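
process() indexes message.value like a dict, so the consumer initialised in Example #7 presumably deserializes JSON on receipt. A minimal sketch under that assumption; the kafka-python client and broker address are inferences from the producer's send()/flush() calls, not confirmed by the source:

# speculative sketch of pubsub.consumer.initialise
import json
from kafka import KafkaConsumer

def initialise(topicName):
    return KafkaConsumer(
        topicName,
        bootstrap_servers='localhost:9092',  # illustrative address
        value_deserializer=lambda v: json.loads(v.decode('utf-8')))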