def tfidf_word_feats(word_matrix, ngrams=(2, )):
    docs_by_word = [' '.join(pred_list) for pred_list in word_matrix]

    # by individual word
    tfidf_word_results = []
    tfidf_wrd = TfidfVectorizer(tokenizer=tokenize)  # pass the callable, do not call it
    word_scores = tfidf_wrd.fit_transform(docs_by_word).toarray()
    word_feat_names = tfidf_wrd.get_feature_names()
    for row in word_scores:
        tfidf_word_results.append({
            pred_name: score
            for score, pred_name in zip(row, word_feat_names) if score > 0
        })

    # by n-gram: merge the per-n dicts so each document keeps a single dict
    # covering every requested n
    tfidf_ng_results = [{} for _ in docs_by_word]
    for n in ngrams:
        tfidf_char = TfidfVectorizer(analyzer='char', ngram_range=(n, n))
        char_scores = tfidf_char.fit_transform(docs_by_word).toarray()
        char_feat_names = tfidf_char.get_feature_names()
        for i, row in enumerate(char_scores):
            tfidf_ng_results[i].update({
                ng: score
                for score, ng in zip(row, char_feat_names)
                if score > 0 and ' ' not in ng
            })

    return list(zip(tfidf_word_results, tfidf_ng_results))
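# A minimal, self-contained sketch of the same per-document TF-IDF idea,
# assuming a recent scikit-learn (get_feature_names_out) and a plain
# whitespace tokenizer standing in for this repo's own `tokenize`.
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["red apple pie", "green apple tart", "red pepper soup"]

# word-level scores: one {token: tfidf} dict per document
word_vec = TfidfVectorizer(tokenizer=str.split, token_pattern=None)
word_scores = word_vec.fit_transform(docs).toarray()
word_names = word_vec.get_feature_names_out()
word_feats = [
    {name: score for name, score in zip(word_names, row) if score > 0}
    for row in word_scores
]

# character n-gram scores, merged over all requested n values
ngram_feats = [{} for _ in docs]
for n in (2, 3):
    char_vec = TfidfVectorizer(analyzer='char', ngram_range=(n, n))
    char_scores = char_vec.fit_transform(docs).toarray()
    char_names = char_vec.get_feature_names_out()
    for i, row in enumerate(char_scores):
        ngram_feats[i].update({
            ng: score for ng, score in zip(char_names, row)
            if score > 0 and ' ' not in ng
        })

print(word_feats[0])
print(sorted(ngram_feats[0])[:5])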
def load_20newsgroups(validation_ratio, normalization):
    """Load the 20 Newsgroups text network.

    Arguments:
        validation_ratio (float): Ratio of the validation split.
        normalization (str): Variant of the normalization method to use.

    Returns:
        adj (chainer.utils.sparse.CooMatrix): (Node, Node) shape
            normalized adjacency matrix.
        labels (np.ndarray): (Node, ) shape labels array.
        idx_train (np.ndarray): Indices of the train split.
        idx_val (np.ndarray): Indices of the validation split.
        idx_test (np.ndarray): Indices of the test split.
    """
    train = fetch_20newsgroups(subset='train')
    test = fetch_20newsgroups(subset='test')
    adj = create_text_adjacency_matrix(
        [tokenize(t) for t in (train['data'] + test['data'])])
    if normalization == 'gcn':
        adj = normalize(adj)
    else:
        adj = normalize_pygcn(adj)

    n_train = int(len(train['data']) * (1.0 - validation_ratio))
    n_all = len(train['data']) + len(test['data'])
    idx_train = np.array(list(range(n_train)), np.int32)
    idx_val = np.array(list(range(n_train, len(train['data']))), np.int32)
    idx_test = np.array(list(range(len(train['data']), n_all)), np.int32)

    labels = np.concatenate(
        (train['target'], test['target'], np.full([adj.shape[0] - n_all], -1)))
    labels = labels.astype(np.int32)
    adj = to_chainer_sparse_variable(adj)

    return adj, labels, idx_train, idx_val, idx_test
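# A small standalone sketch of the index bookkeeping above, assuming the
# usual scikit-learn 20 Newsgroups sizes (11314 train / 7532 test documents)
# and a 0.1 validation ratio; only numpy is needed.
import numpy as np

n_train_docs, n_test_docs, validation_ratio = 11314, 7532, 0.1

n_train = int(n_train_docs * (1.0 - validation_ratio))
n_all = n_train_docs + n_test_docs

idx_train = np.arange(0, n_train, dtype=np.int32)
idx_val = np.arange(n_train, n_train_docs, dtype=np.int32)
idx_test = np.arange(n_train_docs, n_all, dtype=np.int32)

print(len(idx_train), len(idx_val), len(idx_test))  # 10182 1132 7532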
def imojify_input(line, src_lang="en"):
    line = line.lower()
    sents = nlp_utils.tokenize(line)
    imojified = []
    for s in sents:
        imojified.append(
            imojify_sentence(nlp_utils.stem_tokens(s, src_lang), src_lang))
    return imojified
def read_seq_dataset_from_file(filename, max_vocab_size=1000000, min_count=0,
                               unk_tk=UNK_TK, start_tk=START_TK,
                               decode_tk=DECODE_TK, end_tk=END_TK,
                               tokenize=True):
    """Get the sequences and vocab from a file.

    Args:
        filename: name of the file.
        max_vocab_size: the maximum number of tokens in the vocab.
        min_count: the minimum number of appearances for a token to be
            added to the vocab.
        unk_tk: the unknown token.
        start_tk: the start-of-sentence token.
        decode_tk: the start-of-decoding token.
        end_tk: the end-of-decoding token.
        tokenize: whether to tokenize the text in the file.

    Returns:
        seqs: a list of lists of tokens.
        vocab: a Vocab object created from the file.
    """
    vocab = generate_vocab_from_file(filename, tokenize=tokenize,
                                     max_vocab_size=max_vocab_size,
                                     min_count=min_count, unk_tk=unk_tk,
                                     start_tk=start_tk, decode_tk=decode_tk,
                                     end_tk=end_tk)
    seqs = []
    with open(filename, 'r') as f:
        for line in f:
            if tokenize:
                tokens = nlp_utils.tokenize(line)
            else:
                tokens = line.strip().split()
            seqs.append(tokens)
    return seqs, vocab
def generate_vocab_from_stream(text_stream, max_vocab_size=1000000,
                               min_count=0, unk_tk=UNK_TK, start_tk=START_TK,
                               decode_tk=DECODE_TK, end_tk=END_TK,
                               tokenize=True):
    """Create a vocab from a given text stream."""
    token_list = []
    for line in text_stream:
        if tokenize:
            new_list = nlp_utils.tokenize(line)
        else:
            new_list = line.strip().split()
        token_list += new_list
    return generate_vocab_from_list(token_list, max_vocab_size=max_vocab_size,
                                    min_count=min_count, unk_tk=unk_tk,
                                    start_tk=start_tk, decode_tk=decode_tk,
                                    end_tk=end_tk)
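# A minimal sketch of the min_count / max_vocab_size filtering these helpers
# rely on, using only the standard library; the special tokens below are
# illustrative placeholders, not the repo's UNK_TK / START_TK constants.
from collections import Counter


def build_vocab(tokens, max_vocab_size=10, min_count=2,
                specials=('<unk>', '<s>', '<decode>', '</s>')):
    counts = Counter(tokens)
    kept = [tok for tok, c in counts.most_common(max_vocab_size)
            if c >= min_count]
    return {tok: i for i, tok in enumerate(list(specials) + kept)}


stream = ["the cat sat", "the cat ran", "a dog ran"]
tokens = [tok for line in stream for tok in line.split()]
print(build_vocab(tokens))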
    vector[ind - 1] = 1
    return np.array(vector)


def word_vectorize_doc(doc: str, vector_size=10_000, final_doc_len=100):
    '''
    Returns an array of word vectors for a given document
    =============================================
    Inputs:
        - doc           : The document to vectorize
        - vector_size   : The size of the vector for each word
        - final_doc_len : The length of the array of vectors
    Returns:
        - A numpy array of vectors
    '''
    tokenized_doc = tokenize(doc)  # Tokenize document
    if len(tokenized_doc) > final_doc_len:
        tokenized_doc = tokenized_doc[:final_doc_len]

    # Create an ordered sequence of vectors
    vectors = []
    for word in tokenized_doc:
        vectors.append(word_vectorize(word, vector_size=vector_size))

    # Pad shorter documents with zero vectors up to the fixed length
    while len(vectors) < final_doc_len:
        vectors.append(np.zeros(vector_size))

    return np.array(vectors)
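# A self-contained sketch of the fixed-length one-hot encoding above; the
# hash-based index is an illustrative stand-in for whatever index the repo's
# word_vectorize derives, and whitespace split stands in for tokenize().
import numpy as np


def one_hot_word(word, vector_size=10_000):
    vec = np.zeros(vector_size)
    vec[hash(word) % vector_size] = 1  # assumed indexing scheme
    return vec


def one_hot_doc(doc, vector_size=10_000, final_doc_len=100):
    words = doc.split()[:final_doc_len]      # truncate long documents
    vectors = [one_hot_word(w, vector_size) for w in words]
    while len(vectors) < final_doc_len:      # pad short documents
        vectors.append(np.zeros(vector_size))
    return np.array(vectors)


print(one_hot_doc("pad me please", vector_size=8, final_doc_len=5).shape)  # (5, 8)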
tags = data["tags"] model = NeuralNet(input_size,hidden_size,output_size) model.load_state_dict(model_state) # call model.eval() to set dropout and batch normalization layers to evaluation mode before running inference. Failing to do this will yield inconsistent inference results. # acts like a switch to turn of some layers during evaluation/inference # this is evaluation mode model.eval() bot_name = "Yukti's Bot" print("Let's chat! Type 'quit' to exit..") while True: sentence = input('You: ') if sentence == "quit": break sentence = tokenize(sentence) # bagofwords function returns a numpy array X = bagOfWords(sentence,all_words) X = X.reshape(1,X.shape[0]) X = torch.from_numpy(X).to(device) output = model(X) _,predicted = torch.max(output,dim=1) # predicted.item - class label tag = tags[predicted.item()] # checking if the probability of the tag is high enough # applying softmax to get the actual probabilities probs = torch.softmax(output,dim=1) prob = probs[0][predicted.item()] if (prob.item() > 0.70): # finding corresponding intent for this tag
def question(sentence, userName, userID):
    sentence = tokenize(sentence)
    ignore_words = [
        '?', '!', '.', '"', '@', '#', '^', '=', '-', ',', '/', '*', '$', '&',
        '(', ')', ' '
    ]
    sentence = [w for w in sentence if w not in ignore_words]
    X = bag_of_words(sentence, all_words)
    X = X.reshape(1, X.shape[0])
    X = torch.from_numpy(X).to(device)

    output = model(X)
    _, predicted = torch.max(output, dim=1)
    tag = tags[predicted.item()]

    probs = torch.softmax(output, dim=1)
    prob = probs[0][predicted.item()]
    if prob.item() > 0.9:
        print(prob.item())
        for intent in intents['intents']:
            if tag == intent["tag"]:
                # print(tag)
                answer = random.choice(intent['responses'])
                result = None
                if tag == 'cancel' or tag == 'postpone':
                    result = fb_Data.get('/members',
                                         '/' + userID + '/appointment')
                    if result is None:
                        answer = 'คุณ(customer_name) ยังไม่ได้ทำการจองนัดหมายกรุณาจองนัดก่อนค่ะ \nคลิกที่เมนูเพื่อจองนัด'
                        tag = 'appointment'
                if '(name)' in answer:
                    name = ""
                    for dent in dentist['Dentist']:
                        for dayWeek in dent['OnDuty']:
                            if dayWeek == datetime.datetime.now().strftime(
                                    "%a"):  # the day of the week matches
                                name = name + "คุณหมอ" + dent['Name'] + " "
                    answer = answer.replace('(name)', name)
                if '(customer_name)' in answer:
                    answer = answer.replace('(customer_name)', userName)
                if '(date)' in answer or '(time)' in answer:
                    if result is not None:
                        for i, day in enumerate(result):
                            if i + 1 == len(result):
                                time_str = result[day]['time']
                                date_time_str = result[day]['date']
                                date_time_obj = datetime.datetime.strptime(
                                    date_time_str, '%Y-%m-%d')
                                date_time_format = date_time_obj.strftime(
                                    '%d-%m-%Y')
                                answer = answer.replace(
                                    '(date)', date_time_format).replace(
                                        '(time)', time_str)
                    else:
                        time = datetime.datetime.now()
                        answer = answer.replace(
                            '(date)', time.strftime("%x")).replace(
                                '(time)', time.strftime("%X"))
                if '(list)' in answer or '(price)' in answer:
                    for dental in dental_lists["dental_lists"]:
                        for a in dental["homonyms"]:
                            if a in sentence:
                                list1 = dental["homonyms"][0]
                                price = dental["cost"]
                                print(list1, price)
                                answer = answer.replace(
                                    '(list)', list1).replace(
                                        '(price)', str(price))
                                break
                    if '(list)' in answer or '(price)' in answer:
                        answer = 'ราคา'
                return answer, tag
    else:
        return "ยิ้มสวยไม่เข้าใจค่ะ ลองถามใหม่อีกครั้งค่ะ", ""


# # test case
# test1 = ['สวัสดี', 'สวัสดีครับผม', "สบายดีไหม"]
# test2 = ['นัดทำฟัน', 'ขอทำฟัน', 'ขอไม่นัดหมอนะ']
## test3 = ['นัดอุดฟัน', 'อยากทำวีเนียร์', 'อยากจัดฟัน', 'ถอนฟัน', 'นัดเอ็กซเรย์', 'ทำฟันปลอมค่ะ', 'จะฟอกฟัน', 'ทำแอร์โฟลว', 'นัดทำเลเซอร์','นัดทั่วไป', 'ทำรากฟันเทียม', 'ฟันคุดค่ะ', 'พิมพ์ปากถ่ายรูป', 'ไม่นัดทำเอกซเรย์แล้ว']
# test4 = ['สอบถามราคา', 'จัดฟันราคาเท่าไหร่', 'ขัดหินปูนราคาเท่าไหร่', 'ตกแต่งเหงือกแพงมั้ย', 'ทำฟันปลอมกี่บาท']
# test5 = ['จัดฟันต้องทำยังไง', 'ขอคำปรึกษาหน่อยค่า', 'ไม่ปรึกษา']
# test6 = ['ขอยกเลิกการนัด','อยากยกเลิกนัด','ไม่ทำแล้ว','ไม่อยากทำ']
# test7 = ['ไม่ว่างไปทำฟันในวันนัด', 'ย้ายวันนัด', 'ไม่ว่าง จะยกเลิก']
# test8 = ['ผ่อนชำระได้ไหมคะ', 'จ่ายบัตรเครดิตได้มั้ย']
# test9 = ['คลีนิกเปิดกี่โมง', 'คลีนิกเปิดวันไหนบ้าง']
# test10 = ['อาเระ', 'skfdsokf','ทำไมตอบได้แล้ว','วันนี้วันอะไร']
# for i in test10:
#     print('You : ' + i)
#     print('Bot : ' + question(i))
#     print()
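# A tiny, self-contained illustration of the confidence check used in both
# chat loops above: softmax turns raw logits into probabilities, torch.max
# picks the predicted class, and a threshold decides whether to trust it.
# The logits below are made up for demonstration.
import torch

output = torch.tensor([[1.2, 0.3, 3.1]])  # fake logits for 3 intents
probs = torch.softmax(output, dim=1)
prob, predicted = torch.max(probs, dim=1)

if prob.item() > 0.70:
    print(f"confident: class {predicted.item()} ({prob.item():.2f})")
else:
    print("not confident enough, fall back to a default reply")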
from model import NeuralNet

# open our intents.json file
with open('intents.json', 'r') as f:
    intents = json.load(f)

all_words = []
tags = []
pr = []
for intent in intents['intents']:
    tag = intent['tag']
    tags.append(tag)
    # loop over all the patterns
    for pattern in intent["patterns"]:
        w = tokenize(pattern)
        # extend the list rather than appending another list
        all_words.extend(w)
        pr.append((w, tag))

Stop_words = stopwords.words('english')

# stemming and removing unnecessary words
all_words = [stem(w) for w in all_words if w not in Stop_words]
all_words = sorted(set(all_words))
tags = sorted(set(tags))
# print("T = ", tags)

# bag of words in X
X_train = []
y_train = []
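# A minimal bag-of-words sketch matching the preprocessing above; this
# bag_of_words is an illustrative reimplementation (whitespace split stands
# in for the repo's tokenize()/stem()), not necessarily identical to the
# project's own bagOfWords / bag_of_words helper.
import numpy as np


def bag_of_words(tokenized_sentence, all_words):
    # binary vector over the sorted vocabulary
    bag = np.zeros(len(all_words), dtype=np.float32)
    for i, w in enumerate(all_words):
        if w in tokenized_sentence:
            bag[i] = 1.0
    return bag


vocab = sorted({"hi", "how", "are", "you", "bye"})
print(bag_of_words("hi how are you".split(), vocab))  # [1. 0. 1. 1. 1.]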