def process_data(domain: Text, locale: Text) -> "Tuple[list, list]":
    """Load the markdown training file for a domain/locale into the module globals.

    The file ``<domain>_<locale>.md`` under ``datapath`` is read line by line.
    Lines recognised by ``find_section_header`` switch the current section;
    every other line is handed to ``parse_item``.

    Args:
        domain: Domain name used as the first part of the file name.
        locale: Locale code used as the second part of the file name.

    Returns:
        The module-level ``utterance`` and ``intent`` containers, in that
        order (presumably filled by ``parse_item`` - confirm against it).

    Raises:
        ValueError: If the markdown file does not exist.
    """
    global utterance
    global intent
    # Clear the containers before loading so repeated calls do not accumulate.
    utterance.clear()
    intent.clear()

    data_file = os.path.join(datapath, domain + '_' + locale + ".md")
    try:
        # The context manager closes the handle even if reading fails.
        with codecs.open(data_file, 'r', 'utf-8') as file:
            lines = file.read().split("\n")
    except FileNotFoundError as err:
        log_util.logerrormsg(
            f"[MARKDOWN] no file found for given domain {domain}, ensure that data is given in format .md."
        )
        # The previous error path called json.loads on an invalid literal,
        # which itself raised a ValueError subclass; raise the intended
        # ValueError explicitly instead.
        raise ValueError(
            f"no file found for given domain {domain}, ensure that data is given in format .md."
        ) from err

    log_util.loginfomsg(
        f"[MARKDOWN] recieved data, total lines: {len(lines)}")
    for line in lines:
        line = line.strip()
        header = find_section_header(line)
        if header:
            set_current_section(header[0], header[1])
        else:
            parse_item(line)
    return utterance, intent
def predict(domain, locale, userUtterance):
    """Classify an utterance with the persisted NLU model and return JSON text.

    Args:
        domain: Domain name; with ``locale`` it selects the model directory
            and the training-data file.
        locale: Locale code.
        userUtterance: Raw text to classify.

    Returns:
        A JSON string with the keys ``intent`` (top-ranked name and
        confidence), ``entities``, ``intent_ranking`` (up to three entries,
        each with ``name``, ``confidence`` and ``utterance``) and ``text``.
        Confidences are formatted as strings with two decimals.

    Raises:
        IndexError: If the interpreter returns an empty intent ranking.
    """
    # Local import keeps the block self-contained; the surrounding module's
    # import list is not visible here.
    import json

    global dataFile
    model_root = os.path.join(scriptDir, '..', '..', 'models', 'nlu')
    # Module-level side effect kept from the original implementation
    # (presumably read by getUtterance - confirm).
    dataFile = os.path.join(scriptDir, '..', '..', '..', 'trainingData',
                            'intents', domain + '_' + locale + '.md')
    model_name = domain + '_' + locale
    interpreter = Interpreter.load(os.path.join(model_root, model_name))
    data = interpreter.parse(userUtterance)

    # Slice instead of indexing 0..2 so models that rank fewer than three
    # intents no longer raise IndexError.
    intent_ranking_ = [{
        "name": ranked['name'],
        "confidence": "{:.2f}".format(ranked['confidence']),
        "utterance": getUtterance(ranked['name'])
    } for ranked in data['intent_ranking'][:3]]

    best = intent_ranking_[0]
    response = {
        'intent': {"name": best["name"], "confidence": best["confidence"]},
        'entities': data['entities'],
        'intent_ranking': intent_ranking_,
        'text': data['text'],
    }
    log_util.loginfomsg(f"[PREDICT_NLU] prediction: {response}")

    # json.dumps replaces the quote-swapping of str(dict), which produced
    # invalid JSON for text containing apostrophes and for None/True/False.
    # ensure_ascii=False keeps non-ASCII text readable as before; default=str
    # keeps the "never raises" behaviour for unexpected value types.
    return json.dumps(response, ensure_ascii=False, default=str)
def predict(domain, locale, userUtterance):
    """Classify an utterance through the TF-IDF classifier.

    Args:
        domain: Domain name forwarded to the classifier.
        locale: Locale code; for ``'en'`` the utterance is first reduced to
            ASCII letters and spaces.
        userUtterance: Raw text to classify.

    Returns:
        The classifier response converted with ``str`` with every single
        quote replaced by a double quote.
    """
    # Only English input is sanitised; other locales pass through untouched.
    cleaned = (re.sub(r'[^a-zA-Z ]', '', userUtterance)
               if locale == 'en' else userUtterance)
    candidates = classifier.genUtterances(cleaned, locale)
    prediction = classifier.processUtterance(candidates, domain, locale)
    log_util.loginfomsg(f'[PREDICT_TFIDF]: {prediction}')
    return str(prediction).replace("'", '"').strip()
def getParameter(param):
    """Return the configured value for ``param``.

    Args:
        param: Key to look up in the module-level ``properties`` mapping.

    Returns:
        The stored value, or an empty string when the key is absent. A
        missing key is logged but does not raise.
    """
    if param in properties:
        return properties[param]
    # The previous message called .format() on a string without a
    # placeholder, so the name of the missing parameter was never logged.
    log_util.loginfomsg(
        f'[NLP_CONFIG] the required parameter "{param}" could not be located')
    return ""
def sendMessgae(topicName, key, value):
    """Publish a JSON message through the module-level producer.

    Args:
        topicName: Topic the message is sent to.
        key: Message key as text; it selects the partition and is sent
            UTF-8 encoded.
        value: JSON text; it is parsed and the resulting object is handed
            to the producer as the message value.
    """
    partition_count = int(nlp_config.getParameter('PARTITIONS'))
    target_partition = utils.getPartition(key, partition_count)
    payload = json.loads(value)
    producer.send(topicName,
                  value=payload,
                  key=key.encode('utf-8'),
                  partition=target_partition)
    # Flush right away so the message is not left waiting in the buffer.
    producer.flush()
    log_util.loginfomsg(f'[PRODUCER] sending message: "{value}"')
    log_util.loginfomsg(
        f'[PRODUCER] message sent with key: "{key}" to partition "{target_partition}"!'
    )
# Locale code -> NLTK stop-word corpus name. None marks locales that rely
# solely on the project's own stop-word file.
_NLTK_STOPWORD_CORPORA = {
    'ar': 'arabic',
    'da': 'danish',
    'en': 'english',
    'es': 'spanish',
    'hi': None,
    'mr': None,
    'nl': 'dutch',
    'sv': 'swedish',
}


def _read_stop_words(stop_list_path):
    """Return the set of stop words from a comma-separated, multi-line file."""
    with codecs.open(stop_list_path, 'r', 'utf-8') as handle:
        rows = handle.read().split("\n")
    # Entries consisting only of spaces are dropped; kept entries are stored
    # exactly as written in the file.
    return {
        word
        for row in rows if row != ""
        for word in row.split(',')
        if word.replace(' ', '') != ""
    }


def _dump_pickle(obj, file_name):
    """Pickle ``obj`` to ``file_name``, closing the file even on failure."""
    with open(file_name, 'wb') as handle:
        pickle.dump(obj, handle)


def train(domain, locale, prop):
    """Train and persist the TF-IDF/LSA model for a domain and locale.

    Training data is read either from markdown (via ``process_data``) or
    from ``<domain>_<locale>.json``, depending on the ``FORMAT`` parameter.
    When ``nlp_config.is_config_stale`` returns a truthy value the existing
    model is kept and only the counts are reported.

    Args:
        domain: Domain name used in data and model file names.
        locale: Locale code; selects the stop-word lists and the stemmer.
        prop: Forwarded unchanged to ``nlp_config.is_config_stale``.

    Returns:
        A JSON string with ``intents`` and ``utterances`` counts, plus
        ``model`` after a successful training run. Both counts are ``"-1"``
        when the data cannot be loaded or the format is unsupported, and
        ``"0"`` when the locale is unsupported.
    """
    datapath = os.path.join(scriptDir, '..', '..', '..', 'trainingData',
                            'intents')
    vectorDimension = int(nlp_config.getParameter('VECTOR_DIMENSION'))
    iterationNumbers = int(nlp_config.getParameter('ITERATION_NUMBER'))
    data_format = nlp_config.getParameter('FORMAT')
    failure = json.dumps({"intents": "-1", "utterances": "-1"})

    utterance = []
    intent = []
    if data_format == 'md':
        utterance, intent = process_data(domain, locale)
        if not utterance or not intent:
            log_util.logerrormsg(
                "[TRAIN_TFIDF] could not parse the markdown data. Exiting...")
            return failure
    elif data_format == 'json':
        # datapath already starts with scriptDir; joining scriptDir a second
        # time produced a wrong path whenever scriptDir was relative.
        fileData = os.path.join(datapath, domain + '_' + locale + '.json')
        with codecs.open(fileData, 'r', 'utf-8') as dataFile:
            data = json.load(dataFile)
        for task in data['tasks']:
            for utt in task['utterances']:
                utterance.append(utt)
                intent.append(task['name'])
    else:
        log_util.logerrormsg("unsupported format. Exiting...")
        return failure

    mIntent = set(intent)

    # check if any changes to config
    if nlp_config.is_config_stale(domain, locale, prop):
        log_util.loginfomsg(
            "[TRAIN_TFIDF] no changes found to training data, using pre-trained model"
        )
        return json.dumps({
            "intents": str(len(mIntent)),
            "utterances": str(len(intent))
        })

    stopListFile = os.path.join(scriptDir, '..', '..', 'dictionary',
                                'stopwords_' + locale + '.txt')
    extraStopWords = _read_stop_words(stopListFile)

    if locale not in _NLTK_STOPWORD_CORPORA:
        return json.dumps({"intents": "0", "utterances": "0"})
    corpus = _NLTK_STOPWORD_CORPORA[locale]
    if corpus is None:
        stops = extraStopWords
    else:
        stops = set(stopwords.words(corpus)) | extraStopWords

    stemmer.setLocale(locale)
    # The utterances are no longer passed as the first positional argument:
    # that slot is the vectorizer's `input` option, not the corpus. The stop
    # words are handed over as a list, which is the documented type.
    tfidfVec = TfidfVectorizer(decode_error='ignore',
                               stop_words=sorted(stops),
                               ngram_range=(1, 5),
                               tokenizer=stemmer.stemTokenize)
    trainsetIdfVectorizer = tfidfVec.fit_transform(utterance).toarray()

    # Use the matrix shape for the vocabulary size; indexing row 1 failed
    # when only a single utterance was available.
    vLength = trainsetIdfVectorizer.shape[1]
    nDimension = vectorDimension
    if vLength <= vectorDimension:
        # Keep the SVD below the vocabulary size but never at zero components.
        nDimension = max(1, vLength - 1)

    svd = TruncatedSVD(n_components=nDimension,
                       algorithm='randomized',
                       n_iter=iterationNumbers,
                       random_state=42)
    trainLSA = svd.fit_transform(trainsetIdfVectorizer)

    pickle_path = os.path.join(scriptDir, '..', '..', 'models', 'tfidf',
                               domain + '_' + locale + '_')
    artefacts = (
        ('utterance.m', utterance),
        ('intent.m', intent),
        ('tfidfVec.m', tfidfVec),
        ('svd.m', svd),
        ('trainLSA.m', trainLSA),
    )
    for suffix, artefact in artefacts:
        _dump_pickle(artefact, pickle_path + suffix)

    log_util.loginfomsg(f'[TRAIN_TFIDF] identified domain: {domain}')
    log_util.loginfomsg(f'[TRAIN_TFIDF] identified locale: {locale}')
    log_util.loginfomsg(
        f'[TRAIN_TFIDF] number of utterances for training: {len(intent)}')
    log_util.loginfomsg(
        f'[TRAIN_TFIDF] number of intents for training: {len(mIntent)}')

    return json.dumps({
        "intents": str(len(mIntent)),
        "utterances": str(len(intent)),
        "model": "TFIDF"
    })
def readMessages(topic):
    """Subscribe the module-level consumer to ``topic`` and log each message.

    Args:
        topic: Passed unchanged to ``consumer.subscribe``.
    """
    consumer.subscribe(topic)
    # Iterating the consumer yields incoming messages one at a time.
    for record in consumer:
        log_util.loginfomsg(record)
def train(domain, locale, prop):
    """Train and persist the NLU model for a domain and locale.

    The training file ``<domain>_<locale>.<FORMAT>`` is loaded for the
    formats ``md``, ``json`` and ``yml``. A new model is trained and
    persisted only when ``nlp_config.is_config_stale`` returns a falsy
    value; otherwise the previously persisted model is kept.

    Args:
        domain: Domain name used in data and model names.
        locale: Locale code used in data and model names.
        prop: Forwarded unchanged to ``nlp_config.is_config_stale``.

    Returns:
        A string shaped like JSON with ``intents`` and ``utterances`` counts
        and a ``model`` label derived from the config file name. Both counts
        are ``"-1"`` (and ``model`` is absent) for an unsupported format or
        when a required file is missing.
    """
    data_format = nlp_config.getParameter('FORMAT')
    dataFile = os.path.join(scriptDir, '..', '..', '..', 'trainingData',
                            'intents',
                            domain + '_' + locale + '.' + data_format)
    configFile = os.path.join(scriptDir, '..', '..', 'config',
                              nlp_config.getParameter('CONFIG_FILE'))
    modelFile = os.path.join(scriptDir, '..', '..', 'models', 'nlu')
    MODEL_NAME = domain + '_' + locale
    # Plain concatenation (modelFile + MODEL_NAME) dropped the separator, so
    # the stale model folder was never found and never removed.
    model_dir = os.path.join(modelFile, MODEL_NAME)
    failure = str({"intents": "-1", "utterances": "-1"}).replace("'", '"').strip()

    if data_format not in ('md', 'json', 'yml'):
        log_util.logerrormsg("[TRAIN_NLU] unsupported format. Exiting...")
        return failure

    try:
        training_data = load_data(dataFile)
        trainer = Trainer(config.load(configFile))
        if not nlp_config.is_config_stale(domain, locale, prop):
            trainer.train(training_data)
            # Delete the previous model folder if it exists.
            if os.path.exists(model_dir):
                shutil.rmtree(model_dir)
            trainer.persist(modelFile, fixed_model_name=MODEL_NAME)
        else:
            log_util.loginfomsg(
                "[TRAIN_NLU] no changes found to training data, using pre-trained model"
            )
    except FileNotFoundError:
        log_util.logerrormsg(
            "[TRAIN_NLU] could not locate the NLU config file")
        return failure

    # Group the examples by intent to report per-run statistics.
    training_examples = OrderedDict()
    for example in (e.as_dict_nlu() for e in training_data.training_examples):
        training_examples.setdefault(example['intent'], []).append(example)
    count = sum(len(examples) for examples in training_examples.values())

    log_util.loginfomsg(f'[TRAIN_NLU] identified domain: {domain}')
    log_util.loginfomsg(f'[TRAIN_NLU] identified locale: {locale}')
    log_util.loginfomsg(
        f'[TRAIN_NLU] number of utterances for training: {count}')
    log_util.loginfomsg(
        f'[TRAIN_NLU] number of intents for training: {len(training_examples)}'
    )

    # The label is the second underscore-separated part of the config file
    # name; fall back to the whole stem when there is no underscore instead
    # of raising IndexError after a successful training run.
    config_stem = os.path.splitext(nlp_config.getParameter('CONFIG_FILE'))[0]
    stem_parts = config_stem.split("_")
    algo = (stem_parts[1] if len(stem_parts) > 1 else stem_parts[0]).upper()

    res = {
        "intents": str(len(training_examples)),
        "utterances": str(count),
        "model": 'NLU:' + algo
    }
    return str(res).replace("'", '"').strip()
from utils import log_util
from core import train_model, predict_model
from pubsub import consumer
from pubsub import processMessage
from pubsub import producer as pr

scriptDir = os.path.dirname(__file__)

# ignore all warnings
simplefilter(action='ignore')

# load all the config parameters
nlp_config.loadParameters()

# Compare the flag as text. The previous re.search call used the configured
# value as the regex pattern, so an empty or missing value (or any substring
# of "true", such as "t") switched the engine into broker mode.
if str(nlp_config.getParameter('USE_BROKER')).strip().lower() == 'true':
    log_util.loginfomsg("[APP] broker based NLPEngine enabled")
    # initialise the producer
    pr.initialise()
    # Run the consumer listener on the BOT_TO_NLP topic and hand every
    # message to its own worker thread.
    consumer_ = consumer.initialise(
        nlp_config.getParameter('TOPIC_BOT_TO_NLP'))
    for msg in consumer_:
        log_util.loginfomsg(msg)
        t = threading.Thread(target=processMessage.process, args=(msg, ))
        t.start()
else:
    # NOTE(review): the statements below are taken to belong to this branch;
    # the flattened source does not show their indentation - confirm.
    log_util.loginfomsg("[APP] REST API based NLPEngine enabled")
    app = flask.Flask(__name__)
    scriptDir = os.path.dirname(__file__)
    SERVER_HOST = '0.0.0.0'
    SERVER_PORT = nlp_config.getParameter('PORT')
def _predict_and_reply(message, key, log_template):
    """Predict the intent for the message's utterance and publish the result."""
    domain = message.value['domain']
    locale = message.value['locale']
    utterance = message.value['userUtterance']
    log_util.loginfomsg(log_template.format(utterance, domain, locale))
    reply = {
        "messageId": "PREDICT",
        "domain": domain,
        "locale": locale,
        "userUtterance": utterance,
        "message": predict_model.predict(domain, locale, utterance)
    }
    producer.sendMessgae(nlp_config.getParameter('TOPIC_NLP_TO_BOT'), key,
                         json.dumps(reply))


def process(message):
    """Handle one incoming broker message and publish the reply.

    Messages whose key parses to 0 and contains ``DUMMY`` are dispatched on
    their ``messageId``: ``TRAIN`` triggers training and ``PREDICT``
    triggers a prediction; any other id is ignored. Every other message is
    treated as a prediction request.

    Args:
        message: Object exposing ``key`` (bytes, decoded as UTF-8) and
            ``value`` (a mapping with ``domain``, ``locale`` and, for
            predictions, ``userUtterance``).
    """
    # NOTE(review): the branch nesting was reconstructed from flattened
    # source; the last prediction branch is taken to be the else of the
    # key test - confirm against the original layout.
    key = message.key.decode('utf-8')
    log_util.loginfomsg('[PROCESS_MESSAGE]: message received with key: ' +
                        key + ' message: ' + str(message.value))

    is_control_key = utils.parseKey(key) == 0 and 'DUMMY' in key
    if not is_control_key:
        _predict_and_reply(
            message, key,
            '[PROCESS_MESSAGE] processing the utterance:{} for domain:{} and locale:{}'
        )
        return

    payload = message.value
    message_id = payload['messageId'] if 'messageId' in payload else None
    if message_id == 'TRAIN':
        domain = payload['domain']
        locale = payload['locale']
        log_util.loginfomsg(
            '[INTENT_ENGINE] training the NLP for domain:{} and locale:{}'.
            format(domain, locale))
        outcome = train_model.train(domain, locale)
        # A positive utterance count in the trainer's reply means success.
        trained = int(json.loads(outcome)["utterances"]) > 0
        reply = {
            "messageId": "TRAIN_SUCCESS" if trained else "TRAIN_FAIL",
            "domain": domain,
            "locale": locale,
            "message": outcome
        }
        producer.sendMessgae(nlp_config.getParameter('TOPIC_NLP_TO_BOT'), key,
                             json.dumps(reply))
    elif message_id == 'PREDICT':
        _predict_and_reply(
            message, key,
            '[PROCESS_MESSAGE] predicting the utterance:{} for domain:{} and locale:{}'
        )