Example #1
def ques_feats():
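    """Extract message- and question-level features for every example returned
    by get_qs() and dump the resulting table to check.csv."""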
    qs, intents, orig = get_qs(False, 'lskdn')
    # file=pd.read_csv('Q_F2f.csv', na_filter=False)
    questions = qs
    # original=file.text
    # print(questions)
    output = {
        'Text': [],
        'Extracted Question': [],
        'Question Features': [],
        "Message Features": [],
        "Intent": []
    }
    for ctr, (q, i) in enumerate(zip(questions, intents)):
        ma, mfm = new_feature_engineering_test(orig[ctr])
        output['Message Features'].append(mfm)
        output['Text'].append(orig[ctr])
        output['Intent'].append(i)
        if q.strip():
            print(q)
            all_features, feature_matrix = new_feature_engineering_test(q)
            output['Extracted Question'].append(q)
            output['Question Features'].append(feature_matrix)
        else:
            output['Extracted Question'].append("None")
            output['Question Features'].append("None")
    opt = pd.DataFrame(output)
    # opt=pd.DataFrame(opt.groupby('Text'))
    opt.to_csv('check.csv')
Example #2
def testing(path=None):
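    """Feature-extraction test run over a labelled question set: logs features
    per question and prints a running per-intent accuracy based on the global
    tp/fn counters (presumably updated by log_features)."""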
    if path and not path.endswith('.csv'):
        sys.exit(
            "Please provide a .csv file for training, or omit the argument to use the default training data."
        )
    questions, intents, bak = get_qs(None, path)
    ctr = 0
    for question, lab_cur in zip(questions, intents):
        all_features, feature_matrix = new_feature_engineering_test(question)
        log_features(all_features, feature_matrix, lab_cur)
        # Guard the look-ahead so the last question does not raise an IndexError.
        lab_itr = intents[ctr + 1] if ctr + 1 < len(intents) else lab_cur
        ctr = ctr + 1
        if lab_cur != lab_itr:
            global tp
            global fn
            print("%s accuracy:- %s" % (lab_cur, float(tp / (tp + fn))))
            print("*********************")
            print("NEW INTENT - %s" % (lab_itr))
            tp = 0
            fn = 0
            lab_cur = lab_itr

        print('**********************')
        # ctr=ctr+1

        print("INTENT - %s" % (lab_cur))
Example #3
def ques_feats(path=None):
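    """Like the ques_feats() variant above, but additionally writes a per-intent
    frequency report of message and question features to reports/out_results.txt."""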
    # qs, intents, orig=get_qs(False,'/home/lexica/chatbot/intent/training_data/msg.txt','/home/lexica/chatbot/intent/training_data/intent.txt')
    if path and not path.endswith('.csv'):
        sys.exit(
            "Please provide a .csv file for training, or omit the argument to use the default training data."
        )
    questions, intents, orig = get_qs(False, path)
    output = {
        'Text': [],
        'Extracted Question': [],
        'Question Features': [],
        "Message Features": [],
        "Intent": []
    }
    for ctr, (q, i) in enumerate(zip(questions, intents)):
        ma, mfm = new_feature_engineering_test(orig[ctr])
        output['Message Features'].append(mfm)
        output['Text'].append(orig[ctr])
        output['Intent'].append(i)
        if q.strip():
            print(q)
            all_features, feature_matrix = new_feature_engineering_test(q)
            output['Extracted Question'].append(q)
            output['Question Features'].append(feature_matrix)
        else:
            output['Extracted Question'].append("None")
            output['Question Features'].append("None")
    opt = pd.DataFrame(output)
    opt.to_csv('reports/check.csv')
    splice_c = ['Intent', 'Message Features', 'Question Features']
    r_in = opt[splice_c]
    out_file = open('reports/out_results.txt', 'w')
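    # For each intent, count how often every message/question feature occurs
    # and write one frequency table per feature column to the report file.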
    for tent in r_in[splice_c[0]].unique():
        out_file.write(tent + '\n*******************************\n\n')
        splice_tent = r_in[r_in[splice_c[0]] == tent]
        for col in splice_c[1:]:
            out_file.write(col + '\n-------------------------\n\n')
            # Skip the "None" placeholder so its characters are not counted as features.
            comb = [x.strip() for y in splice_tent[col] if y != "None" for x in y]
            counts = Counter(comb).most_common()
            df_to_str = pd.DataFrame(counts, columns=['w', 'f']).to_string()
            out_file.write(df_to_str)
            out_file.write('\n\n')
    out_file.close()
Example #4
def user_question():
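    """Variant of testing() that uses new_feature_engineering() and the default
    get_qs() data; prints a running per-intent accuracy from the global tp/fn
    counters."""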
    # question=input("Enter question\n")
    # all_features, feature_matrix = new_feature_engineering(question)
    # log_features(all_features,feature_matrix)
    # ctr=1
    # file2=open('/home/lexica/chatbot/intent/training_data/is.txt','r')
    # with open('/home/lexica/chatbot/intent/training_data/qs.txt','r') as file:

    #     lab_cur=linecache.getline('/home/lexica/chatbot/intent/training_data/is.txt',ctr,module_globals=None)
    #     lab_cur=lab_cur.strip()
    #     lab_itr=lab_cur
    #     for question in file:
    #         ctr=ctr+1
    #         lab_itr=linecache.getline('/home/lexica/chatbot/intent/training_data/is.txt',ctr,module_globals=None)
    #         lab_itr=lab_itr.strip()
    #         question=pre_proc(question)
    #         if lab_itr=='':
    #             print("empty %s" %(ctr))
    #             continue
    #         if question=='':
    #             # ctr=ctr+1
    #             print('OooOoOoooooOoo')
    #             print(ctr)
    #             continue
    #         # dialogues=open('training_data/questions.txt', 'r').readlines()
    #         # questions = [s.strip() for s in dialogues if s.strip()]
    #         print('**********************')
    #         print("INTENT - %s" %(lab_cur))
    questions, intents = get_qs()
    ctr = 0
    for question, lab_cur in zip(questions, intents):
        all_features, feature_matrix = new_feature_engineering(question)
        log_features(all_features, feature_matrix, lab_cur)
        # Guard the look-ahead so the last question does not raise an IndexError.
        lab_itr = intents[ctr + 1] if ctr + 1 < len(intents) else lab_cur
        ctr = ctr + 1
        if lab_cur != lab_itr:
            global tp
            global fn
            print("%s accuracy:- %s" % (lab_cur, float(tp / (tp + fn))))
            print("*********************")
            print("NEW INTENT - %s" % (lab_itr))
            tp = 0
            fn = 0
            lab_cur = lab_itr

        print('**********************')
        # ctr=ctr+1

        print("INTENT - %s" % (lab_cur))
Example #5
def ques_feats():
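    """Read preprocessed messages and intents from fixed paths, extract features
    for each non-empty question, and dump them to Q_F2f.csv."""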
    qpath = '/home/lexica/chatbot/intent/training_data/msg.txt'
    ipath = '/home/lexica/chatbot/intent/training_data/intents_new.txt'
    questions, intents, orig = get_qs(True, qpath, ipath)
    # print(questions)
    output = {'Text': [], 'Preproc Text': [], 'Features': [], 'Intent': []}
    for i, q in enumerate(questions):
        if q:
            print(q)
            all_features, feature_matrix = new_feature_engineering(q)
            output['Preproc Text'].append(q)
            output['Features'].append(feature_matrix)
            output['Intent'].append('')
            output['Text'].append(orig[i])
    opt = pd.DataFrame(output)
    opt.to_csv('Q_F2f.csv')
Example #6
def master(ext, path=None):
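    """Train and persist the intent classifiers: build a binary feature matrix,
    fit a LogisticRegression (scored with a hold-out split and shuffled
    cross-validation) and a RandomForestClassifier, and pickle both together
    with the intent index and feature list under models/original/. Note that
    path is only validated here; loading always uses big_dataset.csv."""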
    if path and not path.endswith('.csv'):
        sys.exit(
            "Please provide a .csv file for training, or omit the argument to use the default training data."
        )
    suffix = input("Enter file suffix \n")
    if ext:
        questions, intents, bak = get_qs(False,
                                         'training_data/csv/big_dataset.csv')
    else:
        questions, intents, bak = get_qs(True,
                                         'training_data/csv/big_dataset.csv')
    unique_intents = list(set(intents))
    print(unique_intents)
    print(list(zip(questions, intents)))
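    # Encode each intent label as its index into the unique-intent list.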
    Y = []
    for i in intents:
        Y.append(unique_intents.index(i))

    # dialogues, mm = nlp_preprocess(questions)
    # all_features, feature_matrix = feature_engineering(dialogues)
    # dialogues=[pre_proc(line) for line in questions]
    all_features, feature_matrix = new_feature_engineering_train(questions)
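    # Binary bag-of-features encoding: 1 if a global feature fires for the
    # question, 0 otherwise.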
    X = []
    for q in feature_matrix:
        x = []
        for f in all_features:
            if f in q:
                x.append(1)
            else:
                x.append(0)
        X.append(x)
    print(len(X), len(intents))
    print('Logistic Regression')
    logistic_clf = LogisticRegression()
    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.3,
                                                        random_state=42)
    logistic_clf.fit(X_train, Y_train)
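    # Score with three shuffled 70/30 splits over the full data, plus the hold-out set.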
    cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0)
    scores = cross_val_score(logistic_clf, X, Y, cv=cv)
    score = logistic_clf.score(X_test, Y_test)
    print(scores)
    print(score)
    print('RandomForest')
    random_clf = RandomForestClassifier(n_estimators=1000,
                                        max_depth=None,
                                        min_samples_split=2,
                                        random_state=0)
    random_clf.fit(X, Y)

    paths = [
        "models/original/intent-logistic-classifier-" + str(suffix) +
        ".pickle",
        "models/original/intent-random-classifier-" + str(suffix) + ".pickle",
        "models/original/intent-index-" + str(suffix) + ".txt",
        'models/original/all_features' + str(suffix) + '.txt'
    ]
    with open(paths[0], "wb") as f:
        pickle.dump(logistic_clf, f)

    with open(paths[1], "wb") as ff:
        pickle.dump(random_clf, ff)

    intent_index_file = open(paths[2], 'w+')
    intent_index_file.write('\n'.join(unique_intents))
    intent_index_file.close()

    all_feature_file = open(paths[3], 'w+')
    all_feature_file.write('\n'.join(all_features))
    all_feature_file.close()


# Choice of file suffix; choice of training on the full sentence or only on the sentence.

# s_questions = open('training_data/questions_with_slot.txt', 'r').readlines()
# s_intents = open('training_data/intents_with_slot.txt', 'r').readlines()
# s_questions = [s.strip() for s in s_questions if s.strip()]
# s_intents = [s.strip() for s in s_intents if s.strip()]

# unique_intents = list(set(s_intents))
# print(unique_intents)
# Y = []
# for i in s_intents:
# 	Y.append(unique_intents.index(i))

# dialogues, mm = nlp_preprocess(s_questions)
# all_features, feature_matrix = feature_engineering(dialogues)

# X = []
# for q in feature_matrix:
# 	x = []
# 	for f in all_features:
# 		if f in q:
# 			x.append(1)
# 		else:
# 			x.append(0)
# 	X.append(x)

# print('Slot Question Logistic Regression')
# logistic_clf = LogisticRegression()
# logistic_clf.fit(X, Y)
# print('Slot Question RandomForest')
# random_clf = RandomForestClassifier(n_estimators=1000, max_depth=None, min_samples_split=2, random_state=0)
# random_clf.fit(X, Y)

# f = open("intents/labels/intent-logistic-classifier.pickle", "wb")
# pickle.dump(logistic_clf , f)

# ff = open("intents/labels/intent-random-classifier.pickle", "wb")
# pickle.dump(random_clf, ff)

# intent_index_file = open("intents/labels/intent-index.txt", 'w+')
# intent_index_file.write('\n'.join(unique_intents))
# intent_index_file.close()

# all_feature_file = open('intents/labels/all_features.txt', 'w+')
# all_feature_file.write('\n'.join(all_features))
# all_feature_file.close()

# def original_predict(dialogue_tuple):
# 	print("Original predict")
# 	target_features, target_feature_matrix = feature_engineering(dialogue_tuple)
# 	target_x = []
# 	for tf in target_feature_matrix:
# 		tx = []
# 		for f in original_all_features:
# 			if f in tf:
# 				tx.append(1)
# 			else:
# 				tx.append(0)
# 		target_x.append(tx)
# 	random_result = original_random_clf.predict_proba(target_x)
# 	logistic_result = original_logistic_clf.predict_proba(target_x)
# 	resp_dict = {}
# 	resp_list = []
# 	for ri in range(len(logistic_result[0])):
# 		resp_dict[original_unique_intents[ri]] = (random_result[0][ri], logistic_result[0][ri])
# 		resp_list.append({"intent": original_unique_intents[ri], "random": random_result[0][ri], "logistic": logistic_result[0][ri], "score": (random_result[0][ri] + logistic_result[0][ri]) / 2})
# 	sorted_resp_list = sorted(resp_list, key=lambda k: k["score"])
# 	sorted_resp_list.reverse()
# 	print(sorted_resp_list)
# 	for ri in range(len(sorted_resp_list)):
# 		sorted_resp_list[ri]['random'] = str(sorted_resp_list[ri]['random'])
# 		sorted_resp_list[ri]['logistic'] = str(sorted_resp_list[ri]['logistic'])
# 		sorted_resp_list[ri]['score'] = str(sorted_resp_list[ri]['score'])
# 	return sorted_resp_list

# def labels_predict(sentence_tokens, dialogue_tuple):
# 	print("Label predict")
# 	actual_tokens = []
# 	for sent in sentence_tokens:
# 		for token in sent['tokens']:
# 			if token['lemma'].lower() == 'book' and token['pos'].startswith('V'):
# 				actual_tokens.append('|'.join(list(token['originalText'].lower())))
# 			else:
# 				actual_tokens.append(token['originalText'].lower())

# 	testing_question = ' ' + ' '.join(actual_tokens) + ' '
# 	for item in sortedlist:
# 		if testing_question.find(' '+item[0]+' ') > -1:
# 			testing_question = testing_question.replace(' '+item[0]+' ', ' '+item[1]+' ')
# 	testing_question = testing_question.replace('|', '')
# 	testing_question_tokens = testing_question.split()
# 	gc = 0
# 	new_list_for_sentences = []
# 	for ds in range(len(dialogue_tuple[0])):
# 		new_list_for_sentence = []
# 		for ts in range(len(dialogue_tuple[0][ds])):
# 			pos = dialogue_tuple[0][ds][ts][0].split('_')[0]
# 			word = dialogue_tuple[0][ds][ts][0].split('_')[1]
# 			new_list_for_sentence.append((pos + '_' + testing_question_tokens[gc], dialogue_tuple[0][ds][ts][1], dialogue_tuple[0][ds][ts][2]))
# 			gc += 1
# 		new_list_for_sentences.append(new_list_for_sentence)

# 	print(new_list_for_sentences)
# 	target_features, target_feature_matrix = feature_engineering([new_list_for_sentences])
# 	target_x = []
# 	for tf in target_feature_matrix:
# 		tx = []
# 		for f in label_all_features:
# 			if f in tf:
# 				tx.append(1)
# 			else:
# 				tx.append(0)
# 		target_x.append(tx)
# 	random_result = label_random_clf.predict_proba(target_x)
# 	logistic_result = label_logistic_clf.predict_proba(target_x)
# 	resp_dict = {}
# 	resp_list = []
# 	for ri in range(len(logistic_result[0])):
# 		resp_dict[label_unique_intents[ri]] = (random_result[0][ri], logistic_result[0][ri])
# 		resp_list.append({"intent": label_unique_intents[ri], "random": random_result[0][ri], "logistic": logistic_result[0][ri], "score": (random_result[0][ri] + logistic_result[0][ri]) / 2})
# 	sorted_resp_list = sorted(resp_list, key=lambda k: k["score"])
# 	sorted_resp_list.reverse()
# 	print(sorted_resp_list)
# 	for ri in range(len(sorted_resp_list)):
# 		sorted_resp_list[ri]['random'] = str(sorted_resp_list[ri]['random'])
# 		sorted_resp_list[ri]['logistic'] = str(sorted_resp_list[ri]['logistic'])
# 		sorted_resp_list[ri]['score'] = str(sorted_resp_list[ri]['score'])
# 	return sorted_resp_list

# def predict(target_message):
# 	# print(target_message)
# 	pre, stanford_responses = nlp_preprocess([target_message])
# 	stanford_response = stanford_responses[0]

# 	label_result = labels_predict(stanford_response, pre)
# 	original_result = original_predict(pre)
# 	print(label_result[:3])
# 	print(original_result[:3])
# 	return label_result, original_result, stanford_responses

# predict('I am looking for visting the law library this coming thursday and next tuesday. Can I')