def test_tokenize_ideal(self):
    """
    Ideal tokenize scenario
    """
    expected = ['the', 'weather', 'is', 'sunny', 'the', 'man', 'is', 'happy']
    actual = tokenize('The weather is sunny, the man is happy.')
    self.assertEqual(expected, actual)

def test_tokenize_several_sentences(self):
    """
    Tokenize text with several sentences
    """
    expected = ['the', 'first', 'sentence', 'the', 'second', 'sentence']
    actual = tokenize('The first sentence. The second sentence.')
    self.assertEqual(expected, actual)

def test_tokenize_punctuation_marks(self):
    """
    Tokenize text with different punctuation marks
    """
    expected = ['the', 'first', 'sentence', 'nice', 'the', 'second', 'sentence', 'bad']
    actual = tokenize('The, first sentence - nice. The second sentence: bad!')
    self.assertEqual(expected, actual)
def filter_(func, lst):
    l = pr.tokenize(lst)
    if l == [""]:
        return const.FALSE
    return apply(makelist, [
        elm for elm in pr.mapevalconst(l)
        if pr.booltopy(functionisntace(func)(elm))
    ])
def test_tokenize_dirty_text(self):
    """
    Tokenize dirty text
    """
    expected = ['the', 'first', 'sentence', 'the', 'second', 'sentence']
    actual = tokenize('The first% sentence><. The sec&*ond sent@ence #.')
    self.assertEqual(expected, actual)
def get_senses(probability_map, instance, window_size, penalty_score):
    sense_prob_map = {}  # { s: P(s) } probability map for each possible sense the instance can be
    map_of_senses = probability_map[instance[main.word_tag]]
    for sense in map_of_senses.keys():
        # set initial probability to P(s)
        sense_prob_map[sense] = (map_of_senses[sense])[0]
        # map of feature words and probabilities for the sense
        feature_map = (map_of_senses[sense])[1]

        sentence = instance[main.prev] + " " + main.target_token + " " + instance[main.next]
        # feature words for the word instance
        list_of_features = main.tokenize(sentence)
        list_of_features = supervisedModel.words_in_window(list_of_features, window_size)

        stopwordList = stopwords.words('english')
        punctuation = [".", ",", ":", "'", "?", "$", "-", "\""]
        for p in punctuation:
            stopwordList.append(p)

        # remove all the stop words (build a new list; removing items while
        # iterating over the same list skips elements)
        list_of_features = [item for item in list_of_features if item not in stopwordList]

        # stem all the feature words
        for j in range(len(list_of_features)):
            list_of_features[j] = main.stem(list_of_features[j])

        # remove duplicates in the sentence
        list_of_features = list(set(list_of_features))

        # get all the p(f_j|s) for the word instance and fold them into sense_prob_map
        for f in list_of_features:
            # if the feature word exists in the map, update the probability
            if f in feature_map.keys():
                sense_prob_map[sense] *= feature_map[f]
            else:
                sense_prob_map[sense] *= penalty_score
    return sense_prob_map
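# Hedged usage sketch (not from the original file; the surrounding objects are
# assumed to have been built by the training step): the predicted sense is the
# argmax over the scores returned by get_senses().
#
#   sense_scores = get_senses(probability_map, instance, window_size=10, penalty_score=0.0001)
#   best_sense = max(sense_scores, key=sense_scores.get)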
def test_tokenize_big_text_case(self):
    """
    Tokenize big input text scenario
    """
    text = read_from_file('lab_1/tokens.txt')
    expected = text.split()
    actual = tokenize(text)
    self.assertEqual(expected, actual)

def test_tokenize_big_text_length_equal(self):
    """
    Tokenize big input text and assert equal
    """
    text = read_from_file('lab_1/tokens.txt')
    expected = len(text.split())
    actual = len(tokenize(text))
    self.assertEqual(expected, actual)
def test_tokenize_bad_input(self):
    """
    Tokenize bad input argument scenario
    """
    bad_inputs = [[], {}, (), None, 9, 9.34, True]
    expected = []
    for bad_input in bad_inputs:
        actual = tokenize(bad_input)
        self.assertEqual(expected, actual)
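# A minimal sketch, not the lab's actual code: one tokenize() implementation
# consistent with the tests above (lowercase the text, drop punctuation
# characters, split on whitespace, and return [] for non-string input).
def tokenize(text):
    if not isinstance(text, str):
        return []
    cleaned = ''.join(ch for ch in text.lower() if ch.isalpha() or ch.isspace())
    return cleaned.split()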
def test_big_text_get_adjacent_words_term(self):
    """
    Checks if adjacent words for a given term can be found properly
    """
    text = read_from_file('lab_1/data.txt')
    tokens = tokenize(text)
    expected = [['although', 'products']]
    actual = get_adjacent_words(tokens, 'tex', 4, 31)
    self.assertEqual(expected, actual)
def FUNCTION(*arg):
    # `arg` bound here holds the actual arguments
    # `dmyarg` is the formal parameter list, e.g. "(x , y , z)"
    if dmyarg == "nil":
        lst = [pr.eval(each, scope) for each in body]
    else:
        s = pr.joindict({e: arg[i] for i, e in enumerate(pr.tokenize(dmyarg))}, scope)
        lst = [pr.eval(each, s) for each in body]
    return lst.pop()
def update_feature_map(feature_map, word, item):
    # get all feature words
    sentence = item[main.prev] + " " + item[main.next]
    list_of_features = main.tokenize(sentence)

    # stem all words in the sentence
    for i in range(len(list_of_features)):
        list_of_features[i] = main.stem(list_of_features[i])

    # remove duplicates in the sentence
    list_of_features = list(set(list_of_features))

    for f in list_of_features:
        increment_map_value(feature_map, f)
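# increment_map_value() is defined elsewhere in the project; a plausible
# sketch, assuming it just counts one more occurrence of the feature word:
def increment_map_value(feature_map, key):
    feature_map[key] = feature_map.get(key, 0) + 1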
def test_get_adjacent_words_several_contexts_big_text(self):
    """
    Checks if adjacent words for a given term can be found in real text properly
    """
    text = read_from_file('lab_1/data.txt')
    tokens = tokenize(text)
    expected = [['epithelial', 'channels'], ['means', 'aluminate'],
                ['by', 'bicarbonate'], ['the', 'salt']]
    actual = get_adjacent_words(tokens, 'sodium', 1, 1)
    self.assertEqual(expected, actual)

def test_big_text_get_and_sort_concordance_term(self):
    """
    Checks if a context sorts right for a given term and can be found properly
    """
    text = read_from_file('lab_1/data.txt')
    tokens = tokenize(text)
    expected = [['although', 'less', 'compact', 'than', 'tex', 'the', 'xml',
                 'structuring', 'promises', 'to', 'make', 'it', 'widely',
                 'usable', 'and', 'allows', 'for', 'instant', 'display']]
    actual = sort_concordance(tokens, 'tex', 4, 14, True)
    self.assertEqual(expected, actual)

def test_get_concordance_several_contexts_big_text_right(self):
    """
    Checks if contexts for a given term can be found in real text properly,
    taking into consideration the right context
    """
    text = read_from_file('lab_1/data.txt')
    tokens = tokenize(text)
    expected = [['means', 'sodium', 'aluminate'], ['by', 'sodium', 'bicarbonate'],
                ['epithelial', 'sodium', 'channels'], ['the', 'sodium', 'salt']]
    actual = sort_concordance(tokens, 'sodium', 1, 1, False)
    self.assertEqual(expected, actual)
def test_big_text_get_concordance_term(self):
    """
    Checks if a context for a given term can be found properly
    """
    text = read_from_file('lab_1/data.txt')
    tokens = tokenize(text)
    expected = [['although', 'less', 'compact', 'than', 'tex', 'the', 'xml',
                 'structuring', 'promises', 'to', 'make', 'it', 'widely',
                 'usable', 'and', 'allows', 'for', 'instant', 'display', 'in',
                 'applications', 'such', 'as', 'web', 'browsers', 'and',
                 'facilitates', 'an', 'interpretation', 'of', 'its', 'meaning',
                 'in', 'mathematical', 'software', 'products']]
    actual = get_concordance(tokens, 'tex', 4, 31)
    self.assertEqual(expected, actual)
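# A minimal sketch, not the graded implementation, of a get_concordance()
# that matches the shape of the expected values above: for every occurrence
# of the word, collect the left context, the word itself, and the right context.
def get_concordance(tokens, word, left_context_size, right_context_size):
    concordance = []
    for index, token in enumerate(tokens):
        if token == word:
            left = tokens[max(0, index - left_context_size):index]
            right = tokens[index + 1:index + 1 + right_context_size]
            concordance.append(left + [token] + right)
    return concordance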
def evaluate(sentence, two_way_X, two_way_y, max_length=300):
    encoder_input = tf.cast(tf.convert_to_tensor([tokenize(two_way_X, sentence)]), tf.int64)
    start, end = two_way_y.get('<start>'), two_way_y.get('<end>')

    output = tf.convert_to_tensor([start])
    output = tf.expand_dims(output, 0)
    output = tf.cast(output, tf.int64)

    for i in range(max_length):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            encoder_input, output)

        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = transformer(encoder_input, output, False,
                                                     enc_padding_mask, combined_mask,
                                                     dec_padding_mask)

        # select the last word from the seq_len dimension
        predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)

        # CHANGE START: uncomment only one of the two variants below.
        # 1 - ORIGINAL: plain greedy argmax over the vocabulary
        predicted_id = tf.argmax(predictions, axis=-1)

        # 2 - MODIFIED: retry the argmax up to 5 times, rejecting candidates
        # that fail the next-syllable check
        # predicted_id_orig = tf.argmax(predictions, axis=-1)
        # count = 0
        # while True:
        #     if count == 5:
        #         predicted_id = predicted_id_orig
        #         break
        #     predicted_id = tf.argmax(predictions, axis=-1)
        #     # concatenate the predicted_id to the output, which is given to the decoder as its input
        #     if check_next_syl(two_way_y, copy.deepcopy(predicted_id), output, sentence):
        #         break
        #     predictions = predictions.numpy()
        #     predictions[:, :, predicted_id.numpy()] = -100
        #     predictions = tf.convert_to_tensor(predictions)
        #     count += 1
        # CHANGE STOP

        output = tf.concat([output, predicted_id], axis=-1)

        # return the result if the predicted_id is equal to the end token
        if predicted_id == end:
            break

    # output.shape == (1, tokens)
    text = detokenize(two_way_y, output)

    return text, attention_weights
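# Hypothetical invocation (the globals `transformer`, `two_way_X`, and
# `two_way_y` are assumed to be built elsewhere in the script):
#
#   translated, attention = evaluate('an input sentence', two_way_X, two_way_y)
#   print(translated)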
def last(lst):
    l = pr.tokenize(lst)
    # if the list is nil, return nil
    return const.FALSE if l[0] == "" else l[len(l) - 1]
intents = json.load(f)
# print(intents)

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from model import NeuralNet

all_words = []
tags = []
xy = []
for intent in intents['intents']:
    tag = intent['tag']
    tags.append(tag)
    for pattern in intent['patterns']:
        w = tokenize(pattern)
        all_words.extend(w)
        xy.append((w, tag))

ignore_words = ['?', '!', '.', ',', '¿', '¡']
all_words = [stem(w) for w in all_words if w not in ignore_words]
all_words = sorted(set(all_words))
tags = sorted(set(tags))
# print(tags)

X_train = []
y_train = []
for (pattern_sentence, tag) in xy:
    bag = bag_of_words(pattern_sentence, all_words)
    X_train.append(bag)
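# bag_of_words() is a helper defined elsewhere in the project; a plausible
# sketch (an assumption, not the project's exact code), reusing the `np` and
# `stem` already imported above: 1.0 for each vocabulary word present in the
# stemmed sentence, 0.0 otherwise.
def bag_of_words(tokenized_sentence, words):
    sentence_words = [stem(w) for w in tokenized_sentence]
    bag = np.zeros(len(words), dtype=np.float32)
    for idx, w in enumerate(words):
        if w in sentence_words:
            bag[idx] = 1.0
    return bag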
def init(lst):
    l = pr.tokenize(lst)
    return apply(makelist, l) if l[0] == "" else apply(makelist, l[0:len(l) - 1])
def map_(func, lst):
    l = pr.tokenize(lst)
    # if the list is nil, return nil
    if l == [""]:
        return const.FALSE
    # the elements split off by tokenize are still plain strings, so evaluate them first
    return apply(makelist, map(functionisntace(func), pr.mapevalconst(l)))
""" Concordance implementation starter """ import os import main if __name__ == '__main__': current_dir = os.path.dirname(os.path.abspath(__file__)) data = main.read_from_file(os.path.join(current_dir, 'data.txt')) stop_words = main.read_from_file( os.path.join(current_dir, 'stop_words.txt')) tokens = main.tokenize(data) print(f'Raw text: {data[:5]}') print(f'Tokenized text: {tokens[:5]}') tokens = main.remove_stop_words(tokens, stop_words) print(f'Text without stop-words: {tokens[:5]}') frequencies = main.calculate_frequencies(tokens[:5000]) print(f'Frequencies: {frequencies[tokens[0]]}') word = 'dog' concordance = main.get_concordance(tokens, word, 2, 0) print(f'The concordance for {word}: {concordance[:5]}') adjacent = main.get_adjacent_words(tokens, 'dog', 2, 0) print(f'Adjacent words: {adjacent[:5]}') sorted_concordance = main.sort_concordance(tokens, 'dog', 2, 0, True)
"tan" :(True , math.tan , False), "+" :(True , (lambda *arg: reduce((lambda x,y: x+y) , arg)) , False), "-" :(True , (lambda *arg: reduce((lambda x,y: x-y) , arg)) , False), "*" :(True , (lambda *arg: reduce((lambda x,y: x*y) , arg)) , False), "/" :(True , (lambda *arg: reduce((lambda x,y: x/y) , arg)) , False), ">" :(True , (lambda x,y: pr.booltolisp(x>y)) , False), ">=" :(True , (lambda x,y: pr.booltolisp(x>=y)), False), "<" :(True , (lambda x,y: pr.booltolisp(x<y)) , False), "<=" :(True , (lambda x,y: pr.booltolisp(x<=y)) , False), "=" :(True , (lambda x,y: pr.booltolisp(x == y)) , False), "and" :(True , (lambda x,y: pr.booltolisp(pr.booltopy(x) and pr.booltopy(y))) , False), "or" :(True , (lambda x,y: pr.booltolisp(pr.booltopy(x) or pr.booltopy(y))) , False), "not" :(True , (lambda x: pr.booltolisp(not pr.booltopy(x)) ) , False), "cons" :(True , (lambda x,y: apply(makelist,[x] + pr.tokenize(y))) , False), "car" :(True , (lambda x:pr.tokenize(x)[0]) , False), "cdr" :(True , (lambda x:apply(makelist ,pr.tokenize(x)[1:])) , False), "list" :(True , makelist , False), "last" :(True , last , False), "length" :(True , (lambda x: len(pr.tokenize(x))) , False), "init" :(True , init , False), "map" :(True , map_ , False), "filter" :(True , filter_ , False), "list?" :(True , (lambda x:pr.booltolisp(pr.W_islist(x))) , False), "atom?" :(True , (lambda x:pr.booltolisp(pr.W_isatom(x))) , False), "symbol?" :(True , (lambda x:pr.booltolisp(pr.W_issymbol(x))) , False), "null?" :(True , (lambda x:pr.booltolisp(pr.W_isnil(x))) , False), "equal?" :(True , (lambda x,y: pr.booltolisp(x == y)) , False),
""" Concordance implementation starter """ import os import main if __name__ == '__main__': # use data.txt file to test your program current_dir = os.path.dirname(os.path.abspath(__file__)) data = main.read_from_file(os.path.join(current_dir, 'data.txt')) stop_words = main.read_from_file( os.path.join(current_dir, 'stop_words.txt')).split('\n') # here goes your logic: calling methods from concordance.py tokens = main.tokenize(data) print('tokens:', tokens[:10]) print('\n-----------------------------\n') tokens = main.remove_stop_words(tokens, stop_words) # old: 34 sec, new - 3.4 sec print('tokens without stop words:', tokens[:10]) print('\n-----------------------------\n') frequencies = main.calculate_frequencies( tokens) # old: 116 sec, new: ~81 sec print('frequency for the first word:', frequencies[tokens[0]]) print('\n-----------------------------\n') top_10 = main.get_top_n_words(frequencies, 10) print('top 10 words:', top_10)
tags = data['tags']
model_state = data['model_state']

model = NeuralNet(input_size, hidden_size, output_size).to(device)
model.load_state_dict(model_state)
model.eval()

bot_name = "Hamdi"
user_name = input("Merhaba, adın ne?: ")  # Turkish: "Hello, what's your name?"

while True:
    sentence = input('{}: '.format(user_name))
    if sentence == 'quit':
        break

    sentence = tokenize(sentence)
    X = bag_of_words(sentence, all_words)
    X = X.reshape(1, X.shape[0])
    X = torch.from_numpy(X).to(device)

    output = model(X)
    _, predicted = torch.max(output, dim=1)
    tag = tags[predicted.item()]

    probs = torch.softmax(output, dim=1)
    prob = probs[0][predicted.item()]
    if prob.item() > 0.50:
        for intent in intents['intents']:
            if tag == intent['tag']:
import tqdm
from nlp import load_dataset

with open('intents.json', 'r') as f:
    intents = json.load(f)

all_words = []
tags = []
match = []
for intent in intents['intents']:
    tag = intent['tag']
    tags.append(tag)
    for pattern in intent['patterns']:
        tokenized_sentence = tokenize(pattern)
        all_words.extend(tokenized_sentence)
        match.append((tokenized_sentence, tag))

ignored_words = ['?', '.', '!']
all_words = [stem(w) for w in all_words if w not in ignored_words]
all_words = sorted(set(all_words))
tags = sorted(set(tags))

match_1 = []
match_2 = []
for (pattern_sentence, tag) in match:
    bag = bag_of_words(pattern_sentence, all_words)
    match_1.append(bag)