Example #1
def process_data(domain: Text, locale: Text) -> Tuple[List[Text], List[Text]]:
    global utterance
    global intent
    # clear the lists before loading
    utterance.clear()
    intent.clear()
    try:
        with codecs.open(
                os.path.join(datapath, domain + '_' + locale + ".md"),
                'r', 'utf-8') as file:
            lines = file.read().split("\n")
        log_util.loginfomsg(
            f"[MARKDOWN] received data, total lines: {len(lines)}")
        for line in lines:
            line = line.strip()
            header = find_section_header(line)
            if header:
                set_current_section(header[0], header[1])
            else:
                parse_item(line)
        return utterance, intent
    except FileNotFoundError:
        log_util.logerrormsg(
            f"[MARKDOWN] no file found for given domain {domain}, ensure that data is given in format .md."
        )
        return {
            "response": f"ERROR: no file found for given domain {domain}, ensure that data is given in format .md."
        }
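The helpers find_section_header, set_current_section, and parse_item are not shown here. A minimal sketch of what they might look like, assuming Rasa-style markdown training data (## intent:greet headers followed by - hello items); the regex and the section handling are illustrative assumptions, not the project's actual code:

import re

current_intent = None  # hypothetical module-level state

def find_section_header(line):
    # hypothetical: match headers like "## intent:greet" and
    # return ("intent", "greet"); None for ordinary lines
    match = re.match(r'##\s*(\w+):(\S+)', line)
    return match.groups() if match else None

def set_current_section(section_type, title):
    # hypothetical: remember which intent the following items belong to
    global current_intent
    current_intent = title if section_type == 'intent' else None

def parse_item(line):
    # hypothetical: collect "- some utterance" items under the current
    # intent, appending to the same global lists process_data() clears
    if current_intent and line.startswith('- '):
        utterance.append(line[2:].strip())
        intent.append(current_intent)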
Example #2
def predict(domain, locale, userUtterance):
    modelFile = os.path.join(scriptDir, '..', '..', 'models', 'nlu')
    global dataFile
    dataFile = os.path.join(scriptDir, '..', '..', '..', 'trainingData',
                            'intents', domain + '_' + locale + '.md')
    MODEL_NAME = domain + '_' + locale
    interpreter = Interpreter.load(os.path.join(modelFile, MODEL_NAME))
    data = interpreter.parse(userUtterance)
    intent_, score_, utterance_ = [], [], []
    # take the top three intents from the ranking; the slice also avoids
    # an IndexError when fewer than three intents are returned
    for ranking in data['intent_ranking'][:3]:
        intent_.append(ranking['name'])
        score_.append("{:.2f}".format(ranking['confidence']))
        utterance_.append(getUtterance(ranking['name']))
    entities_ = data['entities']
    text_ = data['text']
    intent_ranking_ = [{
        "name": p,
        "confidence": q,
        "utterance": r
    } for p, q, r in zip(intent_, score_, utterance_)]
    intent_top_ = {"name": intent_[0], "confidence": score_[0]}
    # build JSON response
    response = {}
    response['intent'] = intent_top_
    response['entities'] = entities_
    response['intent_ranking'] = intent_ranking_
    response['text'] = text_
    log_util.loginfomsg(f"[PREDICT_NLU] prediction: {response}")
    # json.dumps is safer than replacing quotes by hand, which breaks on
    # apostrophes inside the utterance text
    result = json.dumps(response)
    return result
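Since predict returns a JSON string, callers are expected to parse it; a hypothetical usage (the domain, locale, and utterance values are made up):

import json

result = predict('banking', 'en', 'what is my account balance')
parsed = json.loads(result)
print(parsed['intent']['name'], parsed['intent']['confidence'])
for candidate in parsed['intent_ranking']:
    print(candidate['name'], candidate['confidence'], candidate['utterance'])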
Example #3
def predict(domain, locale, userUtterance):
    if locale == 'en':
        # keep only letters and spaces; digits and punctuation are stripped
        utter = re.sub(r'[^a-zA-Z ]', '', userUtterance)
    else:
        utter = userUtterance

    combinations = classifier.genUtterances(utter, locale)
    response = classifier.processUtterance(combinations, domain, locale)
    log_util.loginfomsg(f'[PREDICT_TFIDF]: {response}')
    result = json.dumps(response)
    return result
Example #4
def getParameter(param):
    global properties
    res = ""
    if param in properties:
        res = properties[param]
        return res
    else:
        log_util.loginfomsg(
            '[NLP_CONFIG] the required parameter {} could not be located'.
            format(param))
        return res
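properties is a module-level dict that loadParameters is expected to have filled beforehand (see Example #9). A minimal sketch of such a loader, assuming a simple key=value properties file; the file name and format are assumptions:

import codecs

properties = {}

def loadParameters(path='nlp.properties'):
    # hypothetical: read key=value pairs, skipping blanks and comments
    global properties
    with codecs.open(path, 'r', 'utf-8') as f:
        for line in f:
            line = line.strip()
            if line and not line.startswith('#') and '=' in line:
                key, _, value = line.partition('=')
                properties[key.strip()] = value.strip()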
Example #5
def sendMessage(topicName, key, value):
    global producer
    pNum = utils.getPartition(key, int(nlp_config.getParameter('PARTITIONS')))
    msg = json.loads(value)
    log_util.loginfomsg("[PRODUCER] sending message: \"{}\"".format(value))
    producer.send(topicName,
                  value=msg,
                  key=key.encode('utf-8'),
                  partition=pNum)
    producer.flush()
    log_util.loginfomsg(
        "[PRODUCER] message sent with key: \"{}\" to partition \"{}\"!".format(
            key, pNum))
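utils.getPartition is not shown; a deterministic key-based partitioner consistent with how sendMessage uses it might look like this (a sketch, not the project's code):

import zlib

def getPartition(key, partitions):
    # hypothetical: hash the key so the same key always lands
    # on the same partition
    return zlib.crc32(key.encode('utf-8')) % partitions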
Example #6
def train(domain, locale, prop):
    datapath = os.path.join(scriptDir, '..', '..', '..', 'trainingData',
                            'intents')
    vectorDimension = int(nlp_config.getParameter('VECTOR_DIMENSION'))
    iterationNumbers = int(nlp_config.getParameter('ITERATION_NUMBER'))
    dataFormat = nlp_config.getParameter('FORMAT')  # avoid shadowing builtin format()

    utterance = []
    intent = []

    if dataFormat == 'md':
        utterance, intent = process_data(domain, locale)
        if not utterance or not intent:
            log_util.logerrormsg(
                "[TRAIN_TFIDF] could not parse the markdown data. Exiting...")
            res = {"intents": "-1", "utterances": "-1"}
            response = json.dumps(res)  # dumps guarantees valid JSON
            return response
    elif dataFormat == 'json':
        # datapath already starts from scriptDir; joining scriptDir again
        # only works by accident when scriptDir is absolute
        fileData = os.path.join(datapath, domain + '_' + locale + '.json')
        with codecs.open(fileData, 'r', 'utf-8') as dataFile:
            data = json.load(dataFile)
        for nameUtterances in data['tasks']:
            for utt in nameUtterances['utterances']:
                utterance.append(utt)
                intent.append(nameUtterances['name'])
    else:
        log_util.logerrormsg("[TRAIN_TFIDF] unsupported format. Exiting...")
        res = {"intents": "-1", "utterances": "-1"}
        response = json.dumps(res)
        return response

    mIntent = set(intent)

    # check if any changes to config
    if nlp_config.is_config_stale(domain, locale, prop):
        log_util.loginfomsg(
            "[TRAIN_TFIDF] no changes found to training data, using pre-trained model"
        )
        res = {"intents": str(len(mIntent)), "utterances": str(len(intent))}
        response = str(res).replace("'", '"').strip()  # make it a string
        return response
    else:
        pass

    stopListFile = os.path.join(scriptDir, '..', '..', 'dictionary',
                                'stopwords_' + locale + '.txt')
    arrayWords = []
    stopWords = []

    with codecs.open(stopListFile, 'r', 'utf-8') as f:
        lines = f.read().split("\n")
    for line in lines:
        if line != "":
            arrayWords.append(line.split(','))

    for a_word in arrayWords:
        for s_word in a_word:
            if (re.sub(' ', '', s_word)) != "":
                stopWords.append(s_word)

    extraStopWords = set(stopWords)
    nltkLanguage = {
        'ar': 'arabic',
        'da': 'danish',
        'en': 'english',
        'es': 'spanish',
        'hi': None,  # no NLTK stopword list; use the extra list only
        'mr': None,  # no NLTK stopword list; use the extra list only
        'nl': 'dutch',
        'sv': 'swedish'
    }
    if locale not in nltkLanguage:
        res = {"intents": "0", "utterances": "0"}
        response = json.dumps(res)
        return response
    language = nltkLanguage[locale]
    stops = (set(stopwords.words(language))
             if language else set()) | extraStopWords

    stemmer.setLocale(locale)

    tfidfVec = TfidfVectorizer(decode_error='ignore',
                               stop_words=stops,
                               ngram_range=(1, 5),
                               tokenizer=stemmer.stemTokenize)
    trainsetIdfVectorizer = tfidfVec.fit_transform(utterance).toarray()
    vLength = trainsetIdfVectorizer.shape[1]  # feature-vector length
    nDimension = vectorDimension
    if vLength <= vectorDimension:
        nDimension = vLength - 1

    svd = TruncatedSVD(n_components=nDimension,
                       algorithm='randomized',
                       n_iter=iterationNumbers,
                       random_state=42)
    trainLSA = svd.fit_transform(trainsetIdfVectorizer)

    pickle_path = os.path.join(scriptDir, '..', '..', 'models', 'tfidf',
                               domain + '_' + locale + '_')
    artifacts = {
        'utterance': utterance,
        'intent': intent,
        'tfidfVec': tfidfVec,
        'svd': svd,
        'trainLSA': trainLSA
    }
    for name, obj in artifacts.items():
        with open(pickle_path + name + '.m', 'wb') as fileObject:
            pickle.dump(obj, fileObject)

    log_util.loginfomsg(f'[TRAIN_TFIDF] identified domain: {domain}')
    log_util.loginfomsg(f'[TRAIN_TFIDF] identified locale: {locale}')
    log_util.loginfomsg(
        f'[TRAIN_TFIDF] number of utterances for training: {len(intent)}')
    log_util.loginfomsg(
        f'[TRAIN_TFIDF] number of intents for training: {len(mIntent)}')

    res = {
        "intents": str(len(mIntent)),
        "utterances": str(len(intent)),
        "model": "TFIDF"
    }
    response = json.dumps(res)  # make it a string
    return response
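A prediction path would need to read these artifacts back; a hypothetical loader mirroring the pickle layout written above (load_tfidf_model is an invented name, not the project's API):

import os
import pickle

def load_tfidf_model(scriptDir, domain, locale):
    # read back the five pickles written by train(), keyed by artifact name
    prefix = os.path.join(scriptDir, '..', '..', 'models', 'tfidf',
                          domain + '_' + locale + '_')
    artifacts = {}
    for name in ('utterance', 'intent', 'tfidfVec', 'svd', 'trainLSA'):
        with open(prefix + name + '.m', 'rb') as fileObject:
            artifacts[name] = pickle.load(fileObject)
    return artifacts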
Example #7
def readMessages(topic):
    global consumer
    consumer.subscribe(topic)
    for msg in consumer:
        log_util.loginfomsg(msg)
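consumer.initialise is defined elsewhere; assuming kafka-python (which the producer code in Example #5 also suggests), it might look roughly like this. The BROKER_URL parameter name and the nlp_config import location are assumptions:

import json
from kafka import KafkaConsumer

from utils import nlp_config  # assumed location

consumer = None

def initialise(topic):
    # hypothetical: create a consumer that deserialises JSON payloads
    global consumer
    consumer = KafkaConsumer(
        topic,
        bootstrap_servers=nlp_config.getParameter('BROKER_URL'),
        value_deserializer=lambda v: json.loads(v.decode('utf-8')))
    return consumer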
Example #8
def train(domain, locale, prop):
    dataFormat = nlp_config.getParameter('FORMAT')  # avoid shadowing builtin format()
    dataFile = os.path.join(scriptDir, '..', '..', '..', 'trainingData',
                            'intents', domain + '_' + locale + '.' + dataFormat)
    configFile = os.path.join(scriptDir, '..', '..', 'config',
                              nlp_config.getParameter('CONFIG_FILE'))
    modelFile = os.path.join(scriptDir, '..', '..', 'models', 'nlu')
    MODEL_NAME = domain + '_' + locale

    try:
        if dataFormat in ('md', 'json', 'yml'):
            training_data = load_data(dataFile)
            trainer = Trainer(config.load(configFile))

            if not nlp_config.is_config_stale(domain, locale, prop):
                trainer.train(training_data)
                # delete the folder if it exists
                if os.path.exists(os.path.join(modelFile, MODEL_NAME)):
                    shutil.rmtree(os.path.join(modelFile, MODEL_NAME))
                trainer.persist(modelFile, fixed_model_name=MODEL_NAME)
            else:
                log_util.loginfomsg(
                    "[TRAIN_NLU] no changes found to training data, using pre-trained model"
                )
        else:
            log_util.logerrormsg("[TRAIN_NLU] unsupported format. Exiting...")
            res = {"intents": "-1", "utterances": "-1"}
            response = json.dumps(res)
            return response
    except FileNotFoundError:
        log_util.logerrormsg(
            "[TRAIN_NLU] could not locate the NLU config file")
        res = {"intents": "-1", "utterances": "-1"}
        response = json.dumps(res)
        return response

    training_examples = OrderedDict()
    INTENT = 'intent'
    for example in [e.as_dict_nlu() for e in training_data.training_examples]:
        intent = example[INTENT]
        training_examples.setdefault(intent, [])
        training_examples[intent].append(example)
    count = sum(len(examples) for examples in training_examples.values())

    log_util.loginfomsg(f'[TRAIN_NLU] identified domain: {domain}')
    log_util.loginfomsg(f'[TRAIN_NLU] identified locale: {locale}')
    log_util.loginfomsg(
        f'[TRAIN_NLU] number of utterances for training: {count}')
    log_util.loginfomsg(
        f'[TRAIN_NLU] number of intents for training: {len(training_examples)}'
    )

    # the algorithm name is the token after the underscore in CONFIG_FILE
    algo = os.path.splitext(nlp_config.getParameter('CONFIG_FILE'))[0]
    algo = algo.split("_")[1].upper()
    model = 'NLU:' + algo

    res = {
        "intents": str(len(training_examples)),
        "utterances": str(count),
        "model": model
    }
    response = json.dumps(res)  # make it a string
    return response
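Like the TFIDF trainer, this returns its summary as a JSON string; a hypothetical invocation (argument values invented, prop is whatever is_config_stale expects):

import json

summary = json.loads(train('banking', 'en', prop={}))
print(summary['intents'], summary['utterances'], summary.get('model'))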
Example #9
import os
import threading

import flask
# ignore all warnings
from warnings import simplefilter

from utils import log_util
from utils import nlp_config  # assumed location; the original import is not shown
from core import train_model, predict_model
from pubsub import consumer
from pubsub import processMessage
from pubsub import producer as pr

scriptDir = os.path.dirname(__file__)

simplefilter(action='ignore')

# load all the config parameters
nlp_config.loadParameters()

if nlp_config.getParameter('USE_BROKER').lower() == 'true':
    log_util.loginfomsg("[APP] broker based NLPEngine enabled")
    # initialise the producer
    pr.initialise()
    # Run consumer listener to process all the NLP_TO_BOT messages
    consumer_ = consumer.initialise(
        nlp_config.getParameter('TOPIC_BOT_TO_NLP'))
    for msg in consumer_:
        log_util.loginfomsg(msg)
        t = threading.Thread(target=processMessage.process, args=(msg, ))
        t.start()
else:
    log_util.loginfomsg("[APP] REST API based NLPEngine enabled")
    app = flask.Flask(__name__)
    SERVER_HOST = '0.0.0.0'
    SERVER_PORT = nlp_config.getParameter('PORT')
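    # The snippet ends before any route is registered or the server started.
    # A minimal, hypothetical continuation (the route path and payload field
    # names are assumptions, not the project's actual API):
    @app.route('/predict', methods=['POST'])
    def handle_predict():
        payload = flask.request.get_json()
        result = predict_model.predict(payload['domain'], payload['locale'],
                                       payload['userUtterance'])
        return flask.Response(result, mimetype='application/json')

    app.run(host=SERVER_HOST, port=int(SERVER_PORT))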
Example #10
def process(message):
    key = message.key.decode('utf-8')
    log_util.loginfomsg('[PROCESS_MESSAGE]: message received with key: ' +
                        key + ' message: ' + str(message.value))
    # check if the message is for training the NLP
    if utils.parseKey(key) == 0 and 'DUMMY' in key:
        if message.value.get('messageId') == 'TRAIN':
            domain = message.value['domain']
            locale = message.value['locale']
            log_util.loginfomsg(
                '[INTENT_ENGINE] training the NLP for domain:{} and locale:{}'.
                format(domain, locale))
            res = train_model.train(domain, locale)
            n = int(json.loads(res)["utterances"])
            if n > 0:
                res = {
                    "messageId": "TRAIN_SUCCESS",
                    "domain": domain,
                    "locale": locale,
                    "message": res
                }
            else:
                res = {
                    "messageId": "TRAIN_FAIL",
                    "domain": domain,
                    "locale": locale,
                    "message": res
                }
            producer.sendMessage(nlp_config.getParameter('TOPIC_NLP_TO_BOT'),
                                 key, json.dumps(res))
        elif message.value.get('messageId') == 'PREDICT':
            domain = message.value['domain']
            locale = message.value['locale']
            utterance = message.value['userUtterance']
            log_util.loginfomsg(
                '[PROCESS_MESSAGE] predicting the utterance:{} for domain:{} and locale:{}'
                .format(utterance, domain, locale))
            result = predict_model.predict(domain, locale, utterance)
            res = {
                "messageId": "PREDICT",
                "domain": domain,
                "locale": locale,
                "userUtterance": utterance,
                "message": result
            }
            producer.sendMessage(nlp_config.getParameter('TOPIC_NLP_TO_BOT'),
                                 key, json.dumps(res))
    else:
        domain = message.value['domain']
        locale = message.value['locale']
        utterance = message.value['userUtterance']
        log_util.loginfomsg(
            '[PROCESS_MESSAGE] processing the utterance:{} for domain:{} and locale:{}'
            .format(utterance, domain, locale))
        result = predict_model.predict(domain, locale, utterance)
        res = {
            "messageId": "PREDICT",
            "domain": domain,
            "locale": locale,
            "userUtterance": utterance,
            "message": result
        }
        producer.sendMessage(nlp_config.getParameter('TOPIC_NLP_TO_BOT'), key,
                             json.dumps(res))