def ques_feats():
    # NOTE: an early draft; it is shadowed by the ques_feats definitions
    # further down, since Python keeps only the last binding of a name.
    # 'lskdn' is a leftover placeholder argument passed through to get_qs().
    qs, intents, orig = get_qs(False, 'lskdn')
    # file = pd.read_csv('Q_F2f.csv', na_filter=False)
    questions = qs
    output = {
        'Text': [],
        'Extracted Question': [],
        'Question Features': [],
        'Message Features': [],
        'Intent': []
    }
    for ctr, (q, i) in enumerate(zip(questions, intents)):
        # Features of the full original message.
        ma, mfm = new_feature_engineering_test(orig[ctr])
        output['Message Features'].append(mfm)
        output['Text'].append(orig[ctr])
        output['Intent'].append(i)
        if q.strip():
            print(q)
            # Features of just the extracted question, when one exists.
            all_features, feature_matrix = new_feature_engineering_test(q)
            output['Extracted Question'].append(q)
            output['Question Features'].append(feature_matrix)
        else:
            output['Extracted Question'].append("None")
            output['Question Features'].append("None")
    opt = pd.DataFrame(output)
    opt.to_csv('check.csv')

def testing(path=None):
    if path and not path.endswith('.csv'):
        sys.exit(
            "Please use a valid .csv extension for training. "
            "Omit the argument for the default training data. Breaking."
        )
    global tp
    global fn
    questions, intents, bak = get_qs(None, path)
    ctr = 0
    for question, lab_cur in zip(questions, intents):
        all_features, feature_matrix = new_feature_engineering_test(question)
        log_features(all_features, feature_matrix, lab_cur)
        # Peek at the next label; guard the last row to avoid an IndexError.
        lab_itr = intents[ctr + 1] if ctr + 1 < len(intents) else lab_cur
        ctr = ctr + 1
        if lab_cur != lab_itr:
            # log_features() accumulates the tp/fn globals for the current intent.
            print("%s accuracy:- %s" % (lab_cur, float(tp) / (tp + fn)))
            print("*********************")
            print("NEW INTENT - %s" % lab_itr)
            tp = 0
            fn = 0
            lab_cur = lab_itr
            print('**********************')
            print("INTENT - %s" % lab_cur)

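# A minimal sketch (not part of the original pipeline) of the per-intent
# accuracy bookkeeping testing() does with the tp/fn globals, written as a
# pure function. `results` is a hypothetical list of (intent, was_correct)
# pairs; log_features() is assumed, not shown, to produce equivalent data.
from collections import defaultdict


def per_intent_accuracy(results):
    hits = defaultdict(int)   # correct predictions (tp) per intent
    seen = defaultdict(int)   # all predictions (tp + fn) per intent
    for intent, was_correct in results:
        seen[intent] += 1
        if was_correct:
            hits[intent] += 1
    return {intent: float(hits[intent]) / seen[intent] for intent in seen}

# Example: per_intent_accuracy([('greet', True), ('greet', False)])
# returns {'greet': 0.5}.
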
def ques_feats(path=None):
    # NOTE: shadows the draft above and is itself shadowed by the final
    # ques_feats() further down.
    if path and not path.endswith('.csv'):
        sys.exit(
            "Please use a valid .csv extension for training. "
            "Omit the argument for the default training data. Breaking."
        )
    questions, intents, orig = get_qs(False, path)
    output = {
        'Text': [],
        'Extracted Question': [],
        'Question Features': [],
        'Message Features': [],
        'Intent': []
    }
    for ctr, (q, i) in enumerate(zip(questions, intents)):
        ma, mfm = new_feature_engineering_test(orig[ctr])
        output['Message Features'].append(mfm)
        output['Text'].append(orig[ctr])
        output['Intent'].append(i)
        if q.strip():
            print(q)
            all_features, feature_matrix = new_feature_engineering_test(q)
            output['Extracted Question'].append(q)
            output['Question Features'].append(feature_matrix)
        else:
            output['Extracted Question'].append("None")
            output['Question Features'].append("None")
    opt = pd.DataFrame(output)
    opt.to_csv('reports/check.csv')

    # Per-intent frequency report over message and question features.
    splice_c = ['Intent', 'Message Features', 'Question Features']
    r_in = opt[splice_c]
    with open('reports/out_results.txt', 'w') as out_file:
        for tent in r_in[splice_c[0]].unique():
            out_file.write(tent + '\n*******************************\n\n')
            splice_tent = r_in[r_in[splice_c[0]] == tent]
            for col in splice_c[1:]:
                out_file.write(col + '\n-------------------------\n\n')
                # Flatten the feature lists in this column into one bag of
                # words, skipping the "None" placeholders so their characters
                # are not counted as features.
                comb = [x.strip() for y in splice_tent[col]
                        if not isinstance(y, str) for x in y]
                counts = Counter(comb)
                for_frame = [{'w': word, 'f': freq} for word, freq in counts.items()]
                df_to_str = pd.DataFrame(for_frame)[['w', 'f']].sort_values(
                    'f', ascending=False).reset_index(drop=True).to_string()
                out_file.write(df_to_str)
                out_file.write('\n\n')

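# For orientation: a hypothetical sample of the out_results.txt written above
# (words and counts invented for illustration):
#
#   <intent name>
#   *******************************
#
#   Message Features
#   -------------------------
#
#          w   f
#   0  hello  12
#   1     hi   9
#
#   Question Features
#   -------------------------
#   ...
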
def user_question():
    global tp
    global fn
    questions, intents = get_qs()
    ctr = 0
    for question, lab_cur in zip(questions, intents):
        all_features, feature_matrix = new_feature_engineering(question)
        log_features(all_features, feature_matrix, lab_cur)
        # Peek at the next label; guard the last row to avoid an IndexError.
        lab_itr = intents[ctr + 1] if ctr + 1 < len(intents) else lab_cur
        ctr = ctr + 1
        if lab_cur != lab_itr:
            # log_features() accumulates the tp/fn globals for the current intent.
            print("%s accuracy:- %s" % (lab_cur, float(tp) / (tp + fn)))
            print("*********************")
            print("NEW INTENT - %s" % lab_itr)
            tp = 0
            fn = 0
            lab_cur = lab_itr
            print('**********************')
            print("INTENT - %s" % lab_cur)

def ques_feats():
    # Final definition of ques_feats(); this is the binding Python keeps.
    qpath = '/home/lexica/chatbot/intent/training_data/msg.txt'
    ipath = '/home/lexica/chatbot/intent/training_data/intents_new.txt'
    questions, intents, orig = get_qs(True, qpath, ipath)
    output = {
        'Text': [],
        'Preproc Text': [],
        'Features': [],
        'Intent': []
    }
    for i, q in enumerate(questions):
        if q != "":
            print(q)
            all_features, feature_matrix = new_feature_engineering(q)
            output['Preproc Text'].append(q)
            output['Features'].append(feature_matrix)
            output['Intent'].append('')  # left blank for hand-labelling
            output['Text'].append(orig[i])
    opt = pd.DataFrame(output)
    opt.to_csv('Q_F2f.csv')

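# Note (inferred from the commented-out read in the first ques_feats() draft
# above): Q_F2f.csv is presumably loaded back after hand-labelling with
#     labelled = pd.read_csv('Q_F2f.csv', na_filter=False)
# where na_filter=False keeps the blank 'Intent' cells as '' instead of NaN.
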
def master(ext, path=None):
    if path and not path.endswith('.csv'):
        sys.exit(
            "Please use a valid .csv extension for training. "
            "Omit the argument for the default training data. Breaking."
        )
    suffix = input("Enter file suffix \n")
    # Fall back to the default dataset when no path is given; ext toggles
    # whether get_qs() preprocesses the questions.
    data_path = path or 'training_data/csv/big_dataset.csv'
    if ext:
        questions, intents, bak = get_qs(False, data_path)
    else:
        questions, intents, bak = get_qs(True, data_path)
    unique_intents = list(set(intents))
    print(unique_intents)
    print(list(zip(questions, intents)))
    # Encode each intent label as its index in unique_intents.
    Y = [unique_intents.index(i) for i in intents]

    all_features, feature_matrix = new_feature_engineering_train(questions)
    # One-hot encode every question over the global feature vocabulary.
    X = []
    for q in feature_matrix:
        X.append([1 if f in q else 0 for f in all_features])
    print(len(X), len(intents))

    print('Logistic Regression')
    logistic_clf = LogisticRegression()
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, random_state=42)
    logistic_clf.fit(X_train, Y_train)
    cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0)
    scores = cross_val_score(logistic_clf, X, Y, cv=cv)
    score = logistic_clf.score(X_test, Y_test)
    print(scores)
    print(score)

    print('RandomForest')
    random_clf = RandomForestClassifier(n_estimators=1000, max_depth=None,
                                        min_samples_split=2, random_state=0)
    random_clf.fit(X, Y)

    # Persist both classifiers plus the intent index and feature vocabulary.
    paths = [
        "models/original/intent-logistic-classifier-" + str(suffix) + ".pickle",
        "models/original/intent-random-classifier-" + str(suffix) + ".pickle",
        "models/original/intent-index-" + str(suffix) + ".txt",
        "models/original/all_features" + str(suffix) + ".txt"
    ]
    with open(paths[0], "wb") as f:
        pickle.dump(logistic_clf, f)
    with open(paths[1], "wb") as ff:
        pickle.dump(random_clf, ff)
    with open(paths[2], 'w+') as intent_index_file:
        intent_index_file.write('\n'.join(unique_intents))
    with open(paths[3], 'w+') as all_feature_file:
        all_feature_file.write('\n'.join(all_features))
    # TODO: choice for suffix, choice for training on the full sentence or
    # only on the extracted question.

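# Hedged sketch (assumptions flagged below) of consuming the artifacts that
# master() saves: unpickle a classifier, rebuild the binary feature vector the
# same way training did, and rank intents by probability. It assumes
# new_feature_engineering_test(message) returns (features, feature_matrix)
# where feature_matrix is an iterable of feature strings for one message,
# matching its per-question use in testing() above.
def load_and_predict(message, suffix):
    with open("models/original/intent-logistic-classifier-" + suffix + ".pickle", "rb") as f:
        clf = pickle.load(f)
    with open("models/original/intent-index-" + suffix + ".txt") as f:
        unique_intents = f.read().splitlines()
    with open("models/original/all_features" + suffix + ".txt") as f:
        all_features = f.read().splitlines()
    _, feature_matrix = new_feature_engineering_test(message)
    # Same one-hot encoding over the saved feature vocabulary as in master().
    x = [[1 if f in feature_matrix else 0 for f in all_features]]
    # clf.classes_ are the 0..n-1 indices assigned from unique_intents at
    # training time, so column ri of predict_proba maps to unique_intents[ri].
    probs = clf.predict_proba(x)[0]
    return sorted(zip(unique_intents, probs), key=lambda p: p[1], reverse=True)
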
# def original_predict(dialogue_tuple):
#     print("Original predict")
#     target_features, target_feature_matrix = feature_engineering(dialogue_tuple)
#     target_x = []
#     for tf in target_feature_matrix:
#         tx = []
#         for f in original_all_features:
#             if f in tf:
#                 tx.append(1)
#             else:
#                 tx.append(0)
#         target_x.append(tx)
#     random_result = original_random_clf.predict_proba(target_x)
#     logistic_result = original_logistic_clf.predict_proba(target_x)
#     resp_dict = {}
#     resp_list = []
#     for ri in range(len(logistic_result[0])):
#         resp_dict[original_unique_intents[ri]] = (random_result[0][ri], logistic_result[0][ri])
#         resp_list.append({"intent": original_unique_intents[ri],
#                           "random": random_result[0][ri],
#                           "logistic": logistic_result[0][ri],
#                           "score": (random_result[0][ri] + logistic_result[0][ri]) / 2})
#     sorted_resp_list = sorted(resp_list, key=lambda k: k["score"])
#     sorted_resp_list.reverse()
#     print(sorted_resp_list)
#     for ri in range(len(sorted_resp_list)):
#         sorted_resp_list[ri]['random'] = str(sorted_resp_list[ri]['random'])
#         sorted_resp_list[ri]['logistic'] = str(sorted_resp_list[ri]['logistic'])
#         sorted_resp_list[ri]['score'] = str(sorted_resp_list[ri]['score'])
#     return sorted_resp_list


# def labels_predict(sentence_tokens, dialogue_tuple):
#     print("Label predict")
#     actual_tokens = []
#     for sent in sentence_tokens:
#         for token in sent['tokens']:
#             if token['lemma'].lower() == 'book' and token['pos'].startswith('V'):
#                 actual_tokens.append('|'.join(list(token['originalText'].lower())))
#             else:
#                 actual_tokens.append(token['originalText'].lower())
#     testing_question = ' ' + ' '.join(actual_tokens) + ' '
#     for item in sortedlist:
#         if testing_question.find(' ' + item[0] + ' ') > -1:
#             testing_question = testing_question.replace(' ' + item[0] + ' ', ' ' + item[1] + ' ')
#     testing_question = testing_question.replace('|', '')
#     testing_question_tokens = testing_question.split()
#     gc = 0
#     new_list_for_sentences = []
#     for ds in range(len(dialogue_tuple[0])):
#         new_list_for_sentence = []
#         for ts in range(len(dialogue_tuple[0][ds])):
#             pos = dialogue_tuple[0][ds][ts][0].split('_')[0]
#             word = dialogue_tuple[0][ds][ts][0].split('_')[1]
#             new_list_for_sentence.append((pos + '_' + testing_question_tokens[gc],
#                                           dialogue_tuple[0][ds][ts][1],
#                                           dialogue_tuple[0][ds][ts][2]))
#             gc += 1
#         new_list_for_sentences.append(new_list_for_sentence)
#     print(new_list_for_sentences)
#     target_features, target_feature_matrix = feature_engineering([new_list_for_sentences])
#     target_x = []
#     for tf in target_feature_matrix:
#         tx = []
#         for f in label_all_features:
#             if f in tf:
#                 tx.append(1)
#             else:
#                 tx.append(0)
#         target_x.append(tx)
#     random_result = label_random_clf.predict_proba(target_x)
#     logistic_result = label_logistic_clf.predict_proba(target_x)
#     resp_dict = {}
#     resp_list = []
#     for ri in range(len(logistic_result[0])):
#         resp_dict[label_unique_intents[ri]] = (random_result[0][ri], logistic_result[0][ri])
#         resp_list.append({"intent": label_unique_intents[ri],
#                           "random": random_result[0][ri],
#                           "logistic": logistic_result[0][ri],
#                           "score": (random_result[0][ri] + logistic_result[0][ri]) / 2})
#     sorted_resp_list = sorted(resp_list, key=lambda k: k["score"])
#     sorted_resp_list.reverse()
#     print(sorted_resp_list)
#     for ri in range(len(sorted_resp_list)):
#         sorted_resp_list[ri]['random'] = str(sorted_resp_list[ri]['random'])
#         sorted_resp_list[ri]['logistic'] = str(sorted_resp_list[ri]['logistic'])
#         sorted_resp_list[ri]['score'] = str(sorted_resp_list[ri]['score'])
#     return sorted_resp_list


# def predict(target_message):
#     print(target_message)
#     pre, stanford_responses = nlp_preprocess([target_message])
#     stanford_response = stanford_responses[0]
#     label_result = labels_predict(stanford_response, pre)
#     original_result = original_predict(pre)
#     print(label_result[:3])
#     print(original_result[:3])
#     return label_result, original_result, stanford_responses


# predict('I am looking for visting the law library this coming thursday and next tuesday. Can I')