class KerasTokenizer():
    """Convenience wrapper around a ``Tokenizer`` instance.

    The wrapped tokenizer is created lazily by :meth:`fit`; ``vocab``,
    ``encode`` and ``decode`` therefore require :meth:`fit` to have been
    called first.
    """

    def __init__(self, vocab_size=None, oov_token="<OOV>"):
        """Store the settings forwarded to ``Tokenizer`` in :meth:`fit`.

        Args:
            vocab_size: Passed as ``num_words``.
            oov_token: Passed as ``oov_token``.
        """
        self.vocab_size = vocab_size
        self.oov_token = oov_token

    @property
    def vocab(self):
        """Return the fitted tokenizer's ``word_index`` mapping."""
        return self.tokenizer.word_index

    def fit(self, texts):
        """Create a fresh tokenizer and fit it on ``texts``."""
        self.tokenizer = Tokenizer(num_words=self.vocab_size,
                                   oov_token=self.oov_token)
        self.tokenizer.fit_on_texts(texts)

    def encode(self, text):
        """Encode a single string or a collection of strings.

        Returns:
            One sequence for a ``str`` input, otherwise one sequence per
            element of ``text``.
        """
        # isinstance (not ``type(...) ==``) so str subclasses are treated as
        # a single text instead of being iterated character by character.
        if isinstance(text, str):
            return self.tokenizer.texts_to_sequences([text])[0]
        return self.tokenizer.texts_to_sequences(text)

    def decode(self, encoded_text):
        """Decode one sequence of ids or a batch of sequences.

        Returns:
            ``""`` for ``None``/empty input, a single string when the first
            element is an integer, otherwise one string per sequence.
        """
        import numbers

        # ``len`` instead of truthiness: arrays have no unambiguous bool().
        if encoded_text is None or len(encoded_text) == 0:
            return ""
        # numbers.Integral also matches NumPy integer scalars, so a flat
        # array of ids is decoded as one sequence rather than as a batch.
        if isinstance(encoded_text[0], numbers.Integral):
            return self.tokenizer.sequences_to_texts([encoded_text])[0]
        return self.tokenizer.sequences_to_texts(encoded_text)

    def tokenize(self, text):
        """Whitespace-split a string, or every string of a collection."""
        if isinstance(text, str):
            return text.split()
        return [t.split() for t in text]
class Model:
    """Sliding-window next-token dataset plus persistence and sampling.

    ``__init__`` turns ``text`` into windows of ``sliding_token_size`` token
    ids (``self.x``) and the id of the token that follows each window
    (``self.y``). ``self.model`` and ``self.path`` are left as ``None`` and
    must be assigned by the caller before ``save``/``load``/``generate``.
    """

    def __init__(self, text):
        """Tokenize ``text`` on whitespace and build the ``x``/``y`` arrays."""
        self.sliding_token_size = 15
        self.path = None
        self.model = None
        self.tokens = text.split()
        self.tokenizer = Tokenizer(filters='')
        self.tokenizer.fit_on_texts(self.tokens)

        n_windows = len(self.tokens) - self.sliding_token_size
        self.x = numpy.zeros((n_windows, self.sliding_token_size))
        self.y = numpy.zeros((n_windows, 1))

        # One single-id sequence per token, hence the trailing ``[0]`` below.
        sequences = self.tokenizer.texts_to_sequences(self.tokens)
        for token_n in range(n_windows):
            for sliding_token_n in range(self.sliding_token_size):
                self.x[token_n][sliding_token_n] = \
                    sequences[token_n + sliding_token_n][0]
            self.y[token_n] = sequences[token_n + self.sliding_token_size][0]

    def save(self):
        """Pickle model, tokens and tokenizer to ``<path>{model,tokens,tokenizer}.bin``.

        ``self.path`` is used as a plain string prefix.
        """
        artifacts = (
            ('model.bin', self.model),
            ('tokens.bin', self.tokens),
            ('tokenizer.bin', self.tokenizer),
        )
        for suffix, payload in artifacts:
            # Context manager so every handle is flushed and closed.
            with open(self.path + suffix, 'wb') as handle:
                pickle.dump(payload, handle)

    def load(self):
        """Restore model, tokens and tokenizer written by :meth:`save`."""
        # NOTE: pickle executes arbitrary code on load; only read files that
        # this application wrote itself.
        with open(self.path + 'model.bin', 'rb') as handle:
            self.model = pickle.load(handle)
        with open(self.path + 'tokens.bin', 'rb') as handle:
            self.tokens = pickle.load(handle)
        with open(self.path + 'tokenizer.bin', 'rb') as handle:
            self.tokenizer = pickle.load(handle)

    def generate(self):
        """Return a random seed token followed by 100 predicted tokens."""
        text = random.choice(self.tokens)
        for _ in range(100):
            sequences = pad_sequences(
                [self.tokenizer.texts_to_sequences([text])[0]],
                self.sliding_token_size)
            # NOTE(review): ``predict_classes`` is not available on recent
            # Keras models - confirm the pinned Keras/TF version.
            text += ' ' + self.tokenizer.sequences_to_texts(
                [self.model.predict_classes(sequences)])[0]
        return text
def get_result(max_length):
    """Train one TF-IDF + Naive Bayes classifier per label and print metrics.

    Reads ``X_train``/``X_test`` texts and ``y_train``/``y_test`` label
    matrices from ``data/processed``, cuts or pads every text to
    ``max_length`` tokens by round-tripping it through a ``Tokenizer``, fits
    a ``MultinomialNB`` pipeline for each label column, and prints accuracy,
    macro precision/recall/F1 and ROC AUC.

    Args:
        max_length: Number of tokens every text is cut or padded to.

    Returns:
        None. Results are printed.
    """
    data_dir = "data/processed"
    x_test = get_text(data_dir + "/X_test.txt")
    x_train = get_text(data_dir + "/X_train.txt")
    y_test = np.loadtxt(data_dir + "/y_test.txt", dtype=int)
    y_train = np.loadtxt(data_dir + "/y_train.txt", dtype=int)

    # word index from 1
    tokenizer = Tokenizer(num_words=5000)
    tokenizer.fit_on_texts(x_train)
    x_train_cut_num = tokenizer.texts_to_sequences(x_train)
    x_test_cut_num = tokenizer.texts_to_sequences(x_test)

    # NOTE(review): padding uses id 4, which decodes to a real vocabulary
    # word below - confirm this is intended rather than 0.
    x_train_cut_num_pad = pad_sequences(x_train_cut_num, padding="post",
                                        maxlen=max_length, value=4)
    x_test_cut_num_pad = pad_sequences(x_test_cut_num, padding="post",
                                       maxlen=max_length, value=4)
    x_train_cut_text = tokenizer.sequences_to_texts(x_train_cut_num_pad)
    x_test_cut_text = tokenizer.sequences_to_texts(x_test_cut_num_pad)

    nb_pipeline = Pipeline([('tfidf', TfidfVectorizer()),
                            ('clf', MultinomialNB(fit_prior=True,
                                                  class_prior=None))])
    nb_total = 0
    nb_result_list = []
    for i in range(y_test.shape[1]):
        nb_pipeline.fit(x_train_cut_text, y_train[:, i])
        nb_predict = nb_pipeline.predict(x_test_cut_text)
        nb_result_list.append(nb_predict)
        # Count correct predictions for this label column.
        nb_total += np.sum(y_test[:, i] == nb_predict)

    # nb_result_list holds one row per label, i.e. shape (labels, samples).
    # It must be transposed to line up with y_test (samples, labels); a
    # reshape would keep the flat order and pair predictions with the wrong
    # sample/label cells.
    nb_result = np.array(nb_result_list).T

    total_num = y_test.shape[0] * y_test.shape[1]
    print("navie bayes accuracy: ")
    print(nb_total / total_num)
    print("F1 score: ")
    print(precision_recall_fscore_support(y_test, nb_result,
                                          average='macro'))
    print("roc score: ")
    print(roc_auc_score(y_test, nb_result))
class Preprocessor:
    """Fit, cache and apply a ``Tokenizer`` to collections of texts."""

    def __init__(self, cache_path=None, stop_words=None, **extra):
        """Load the tokenizer from ``cache_path`` or create a new one.

        Args:
            cache_path: JSON file written by :meth:`save`. When it exists the
                tokenizer is restored from it and ``extra`` is not used.
            stop_words: Accepted for interface compatibility; not used here.
            **extra: Forwarded to ``Tokenizer`` when no cache file exists.
        """
        if cache_path and os.path.exists(cache_path):
            with open(cache_path, 'r') as f:
                self._tk = tokenizer_from_json(f.read())
        else:
            self._tk = Tokenizer(lower=True, **extra)
        self._cache_path = cache_path

    def fit(self, data):
        """Fit the tokenizer vocabulary on ``data``."""
        self._tk.fit_on_texts(data)

    def save(self):
        """Write the tokenizer as JSON to the configured cache path.

        Raises:
            ValueError: If the instance was created without ``cache_path``.
        """
        if not self._cache_path:
            # Fail with an actionable message instead of the opaque error
            # ``open(None, 'w')`` would raise.
            raise ValueError(
                'Cannot save the tokenizer: no cache_path was configured')
        with open(self._cache_path, 'w') as f:
            f.write(self._tk.to_json())

    def transform(self, data: pd.Series, truncate: Union[str, int] = 'median'):
        """Transform a Series of texts into a Series of fixed-length vectors.

        Args:
            data: Texts to convert.
            truncate: ``'median'`` to use the median sequence length, or the
                target length to pad/cut every sequence to.

        Returns:
            ``pd.Series`` holding one post-padded list of ids per text; an
            empty Series when ``data`` is empty.
        """
        seq = self._tk.texts_to_sequences(data)
        lens = [len(vec) for vec in seq]
        if not lens:
            # The length statistics below are undefined for empty input.
            return pd.Series([], dtype=object)

        logging.info('median %s, mean %s, max %s, min %s',
                     np.median(lens), np.mean(lens), np.max(lens),
                     np.min(lens))
        if truncate == 'median':
            text_len = int(np.median(lens))
        else:
            text_len = truncate
        logging.info('Transforming texts into vectors with %s size', text_len)
        return pd.Series(
            pad_sequences(seq, padding='post', maxlen=text_len).tolist())

    def to_text(self, data):
        """Transform vectors back to text.

        Arguments:
            data {list} -- ndarray or pd.Series
        """
        return self._tk.sequences_to_texts(data)
def build_co_occurence(self, word_index, corpus, window_size):
    """Build a symmetric, distance-weighted co-occurrence matrix.

    Every pair of in-vocabulary tokens at most ``window_size`` positions
    apart on the same line contributes ``1 / distance`` to both
    ``[center, context]`` and ``[context, center]``.

    Args:
        word_index: Mapping from word to integer id.
        corpus: Iterable of text lines.
        window_size: Number of tokens to the left of each center token that
            count as its context.

    Returns:
        The ``(len(word_index) + 1)``-square ``lil_matrix``, also stored on
        ``self.cooccurences``.
    """
    # Cleaning the corpus: round-trip through a default Tokenizer.
    tk = Tokenizer()
    tk.fit_on_texts(corpus)
    corpus = tk.sequences_to_texts(tk.texts_to_sequences(corpus))

    vocab_size = len(word_index) + 1

    # Collecting indices as a sparse matrix
    self.cooccurences = sparse.lil_matrix((vocab_size, vocab_size),
                                          dtype=np.float64)
    print(f"Shape of coocc = {self.cooccurences.shape}")

    # Get the tokenized sequence * TODO implement with tokenizer
    for i, line in enumerate(corpus):
        # TODO add progress bar
        print(f"\rForming the Co-Occurence Matrix : {(100*(i+1 )/len(corpus)):0.2f}%", end = "")
        sys.stdout.flush()

        lowered = (word.lower() for word in line.strip().split())
        token_ids = [word_index[word] for word in lowered
                     if word in word_index]

        # extracting context words to the left
        for center_i, center_id in enumerate(token_ids):
            context_ids = token_ids[max(0, center_i - window_size):center_i]
            contexts_len = len(context_ids)

            # Adding to the coocc matrix
            for left_i, left_id in enumerate(context_ids):
                # Distance in tokens between the context and center word.
                dist = contexts_len - left_i
                inc = 1 / float(dist)
                self.cooccurences[center_id, left_id] += inc
                self.cooccurences[left_id, center_id] += inc
    print()
    print(f"Generated co-occurence matrix of shape {self.cooccurences.shape}")
    return self.cooccurences
# Right-hand context of the word under test, as one space-separated string.
seed_text_rtl = " ".join(word for word in ngram[1])
print(seed_text_ltr, "->", current_word, "->", seed_text_rtl)

# Encode both contexts and pre-pad them to the model's input length.
token_list = tokenizer.texts_to_sequences([seed_text_ltr])[0]
token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1,
                           padding='pre')
token_list_rev = tokenizer.texts_to_sequences([seed_text_rtl])[0]
token_list_rev = pad_sequences([token_list_rev], maxlen=max_sequence_len - 1,
                               padding='pre')

# Run the model once and reuse the scores for both the top-1 word and the
# ranked candidate list (the inputs are identical for both uses).
predicted_probs = model.predict([token_list, token_list_rev])
predicted_id = np.argmax(predicted_probs, axis=-1)
predicted_word = tokenizer.sequences_to_texts([predicted_id])[0]
print(predicted_word)

# Ids of the 4500 highest-scoring words, best first.
predicted_best = np.argsort(-predicted_probs, axis=-1)[0][:4500]
suggestions = []
correct = None
for prob in predicted_best:
    # NOTE: despite its name, ``prob`` is a word id taken from the ranking.
    output_word = tokenizer.sequences_to_texts([[prob]])[0]
    ed = nltk.edit_distance(current_word, output_word)
    if ed == 0:
        print("I got this one; it seems correct -->", current_word, "=", output_word)
def map_to_string(cls, input_vector: List, tokenizer: Tokenizer) -> List:
    '''
    Decode integer sequences back into text with the given tokenizer.

    ``input_vector`` is handed unchanged to ``tokenizer.sequences_to_texts``
    and the resulting list is returned as-is.
    '''
    decoded_texts = tokenizer.sequences_to_texts(input_vector)
    return decoded_texts
# Exploratory script: fits ``tokenizer`` on the text of
# './data/great_expectation.txt' chunk by chunk and prints each token next
# to its spaCy part-of-speech value and tokenizer id.
nlp = spacy.load('en_core_web_lg')
with open('./data/great_expectation.txt', 'r', encoding='utf-8-sig') as f:
    # Newlines become spaces, then the text is split on '.'.
    text_chunks = f.read().replace('\n', ' ').split('.')
# NOTE(review): the first assignment is overwritten immediately, so every
# chunk is processed; 1000 looks like a leftover debugging limit.
LINES = 1000
LINES = len(text_chunks)
for i in tqdm(range(LINES)):
    #line = tokenize_line(text_chunks[i])
    line = text_chunks[i].strip().lower()
    # ``F`` is a pattern defined outside this excerpt; matches become spaces.
    line = re.sub(F, ' ', line)
    if len(line) <= 0:
        continue  # skip chunks that are empty after cleaning
    line = tokenize_line(line)
    tokenizer.fit_on_texts(line)
    seq = tokenizer.texts_to_sequences(line)
    txt = tokenizer.sequences_to_texts(seq)
    toks = nlp(' '.join(txt))
    for s, t in zip(seq, toks):
        # token, spaCy ``pos`` value, first tokenizer id of that token
        print(t, t.pos, s[0])
# NOTE(review): the original nesting of this ``exit()`` could not be
# recovered from the flattened source - confirm whether it belongs inside
# the loop. Wherever it sits, nothing below it is reached.
exit()
# NOTE(review): ``tok`` and ``count`` are not defined anywhere in the visible
# code (the tokenizer above is named ``tokenizer``) - verify before
# re-enabling this part.
with open('./tokenizer.pickle', 'wb') as f:
    pickle.dump(tok, f)
print('word count: ', len(tok.word_counts))
print(tok.word_counts)
print('count = ', count)
filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n' + additional_filters, lower=True, split=" ", char_level=False, oov_token="UNK", document_count=0) token.fit_on_texts(sentence_data) tokenizer_config = token.get_config() print(tokenizer_config.keys()) #print(tokenizer_config["word_index"]) #print("\n\n\n\n\n\n\n") import json word_counts = json.loads(tokenizer_config['word_counts']) #print(word_counts) print(word_counts["the"]) index_word = json.loads(tokenizer_config['index_word']) word_index=json.loads(tokenizer_config["word_index"]) #print(sentence_data) print(sentence_data[:5]) sentence_seq = token.texts_to_sequences(sentence_data) print(sentence_seq[0:5]) senetn = token.sequences_to_texts(sentence_seq) print(senetn[:5])
class Pungen:
    """Pun generation pipeline: corpus parsing, embeddings, models, puns."""

    def __init__(self, **kwargs):
        """Create an empty pipeline.

        Args:
            **kwargs: Only ``filepath`` is read.
        """
        self.filepath = kwargs.get('filepath')
        self.embedding_layer = None
        # Smoother model; set by ``run`` when a smoother path is supplied.
        self.dac = None

    def _parse_corpus(self, min_seq_len, filepath):
        """Read the corpus, fit the tokenizer and keep long enough sequences.

        Args:
            min_seq_len: Sequences with fewer tokens are dropped.
            filepath: UTF-8 text file, one text per line.
        """
        print('Indexing word vectors.')
        self.texts = []
        with open(filepath, encoding='utf-8') as fp:
            for line in fp:
                if line == "\n":
                    continue
                self.texts.append(line)

        self.tokenizer = Tokenizer(num_words=MAX_NUM_WORDS,
                                   filters=TOKEN_FILTER)
        self.tokenizer.fit_on_texts(self.texts)
        self.sequences = self.tokenizer.texts_to_sequences(self.texts)
        self.sequences = [x for x in self.sequences if len(x) >= min_seq_len]
        self.word_index = self.tokenizer.word_index
        print('Found %s unique tokens.' % len(self.word_index))
        print('Found %s texts.' % len(self.sequences))

    def prepare_emb(self, emb_dim, input_length):
        """Load GloVe vectors and build a frozen ``Embedding`` layer.

        Args:
            emb_dim: Vector size; selects ``glove.6B.<emb_dim>d.txt``.
            input_length: Forwarded to the ``Embedding`` layer.
        """
        print('Indexing word vectors.')
        emb_name = 'glove.6B.' + str(emb_dim) + "d.txt"
        self.embeddings_index = {}
        with open(os.path.join(GLOVE_DIR, emb_name), encoding='utf-8') as f:
            for line in f:
                word, coefs = line.split(maxsplit=1)
                coefs = np.fromstring(coefs, 'f', sep=' ')
                self.embeddings_index[word] = coefs
        print('Found %s word vectors.' % len(self.embeddings_index))

        # prepare embedding matrix
        num_words = MAX_NUM_WORDS
        self.embedding_matrix = np.zeros((num_words, emb_dim))
        for word, i in self.word_index.items():
            if i >= num_words:
                continue
            embedding_vector = self.embeddings_index.get(word)
            if embedding_vector is not None:
                # words not found in embedding index will be all-zeros.
                self.embedding_matrix[i] = embedding_vector

        # load pre-trained word embeddings into an Embedding layer
        # note that we set trainable = False so as to keep the embeddings fixed
        self.embedding_layer = Embedding(
            num_words,
            emb_dim,
            embeddings_initializer=Constant(self.embedding_matrix),
            input_length=input_length,
            trainable=False)

    def check_generator(self):
        """Print whether decoding the sequences reproduces the raw texts."""
        decoded_texts = self.tokenizer.sequences_to_texts(self.sequences)
        if len(decoded_texts) != len(self.texts):
            print("Different sizes of texts")
            return

        filtered_chars = set(TOKEN_FILTER)
        for decoded, raw in zip(decoded_texts, self.texts):
            # Drop only the line terminator; slicing off the last character
            # would damage a final line that has no trailing newline.
            original = raw.rstrip("\n")
            if decoded.lower() == original.lower():
                continue
            # A mismatch is expected when the tokenizer filtered characters.
            if any(c in filtered_chars for c in original.lower()):
                continue
            print(decoded, original)
            print(original.lower())
            print("Tokenizer failed to tokenize properly!")
            return
        print("Tokenizer check was succesfull!")

    def _smooth_candidates(self, candidates, sequence, pun_word_id,
                           pun_alt_id, index_topic, evaluator):
        """Swap each candidate into ``sequence`` and smooth the result.

        ``sequence`` is modified in place and the changes carry over from
        one candidate to the next.

        Returns:
            ``(outputs, total_surprise)``: the decoded smoother outputs and
            the sum of the surprisal values computed before each swap.
        """
        outputs = []
        total_surprise = 0
        for (word, prob) in candidates:
            swap = self.tokenizer.texts_to_sequences([word])
            surprise = evaluator.compute_surpisal(
                sentence=sequence,
                pun_word=pun_word_id,
                pun_alternative=pun_alt_id,
                context_window=2)
            total_surprise = total_surprise + surprise
            print(surprise)

            sequence[index_topic] = swap[0][0]
            post_simple = self.tokenizer.sequences_to_texts([sequence])
            print(post_simple)

            # Zero the neighbours of the swapped position before smoothing.
            sequence[index_topic + 1] = 0
            if index_topic >= 2:
                sequence[index_topic - 1] = 0
            post_smoothing = self.dac.inference(sequence)
            post_smoothing = self.tokenizer.sequences_to_texts(
                post_smoothing.tolist())
            outputs.append(post_smoothing)
            print(post_smoothing)
        return outputs, total_surprise

    def form_pun(self, eval_path):
        """Retrieve a sentence for a pun pair and generate pun candidates.

        Args:
            eval_path: Model path handed to ``Evaluate.load_model``.

        Returns:
            List of decoded smoother outputs: the topic-aware candidates
            followed by the candidates similar to the pun word only.

        Raises:
            Exception: If no sentence is retrieved, the pun pair is not in
                the tokenizer vocabulary, or no usable topic word is found
                within five attempts.
        """
        retrieve = Retrieve(sentence_path=TEXT_DATA_DIR + TEXT_DATA,
                            pun_path=PUN_DATA_DIR + PUN_DATA)
        (pun, sentence, score) = retrieve.retrieve()
        if not sentence:
            print("No sentence with word {} was found. Exiting...".format(
                pun[1]))
            raise Exception("no sentence found for the pun word")

        text = word_tokenize(sentence)
        tokenized = nltk.pos_tag(text)
        print(tokenized)
        print(sentence, pun[0], pun[1])

        pre = self.tokenizer.texts_to_sequences([sentence])
        wp = self.tokenizer.texts_to_sequences([pun[0]])
        wa = self.tokenizer.texts_to_sequences([pun[1]])
        if (not wa[0]) or (not wp[0]):
            print(
                "The pair of pun and word does not exist in the parsed corpus. Exit..."
            )
            raise Exception("pun pair is not in the tokenizer vocabulary")

        # Replace the first occurrence of the alternative word by the pun.
        for position, token_id in enumerate(pre[0]):
            if token_id == wa[0][0]:
                pre[0][position] = wp[0][0]
                break

        wordsimilarity = WordSimilarity()
        wordsimilarity.word2vec()
        wordsimilarity.load()

        try_limit = 5
        try_count = 0
        index_topic = 0
        while True:
            try:
                topic_word = None
                # Resume scanning after the last tried token: the first
                # noun/pronoun becomes the topic word.
                for i in range(index_topic, len(tokenized)):
                    (word, pos) = tokenized[i]
                    if (pos == 'NNP'):
                        # Proper nouns are replaced by the generic "man".
                        topic_word = "man"
                        print(word, pos)
                        index_topic = index_topic + 1
                        break
                    if (pos == 'NN') or (pos == 'PRP') or (pos == 'NNS') or (
                            pos == 'PRP$'):
                        topic_word = word
                        print(word, pos)
                        index_topic = index_topic + 1
                        break
                    index_topic = index_topic + 1
                result = wordsimilarity.getSimilar([topic_word, pun[0]],
                                                   [pun[1]], 10)
                other_result = wordsimilarity.getSimilar([pun[0]], [], 10)
                break
            except KeyError:
                print("Word {} is not in vocabulary, try with the next one".
                      format(topic_word))
                try_count = try_count + 1
                if try_limit == try_count:
                    print("Limit of trys has been reached. Exit...")
                    raise Exception("no usable topic word found")

        eval_surprisal = Evaluate()
        eval_surprisal.load_model(eval_path)

        finals, mean_amalgam = self._smooth_candidates(
            result, pre[0], wa[0][0], wp[0][0], index_topic, eval_surprisal)
        print(finals)
        print(mean_amalgam / 10)

        other_finals, mean_similar = self._smooth_candidates(
            other_result, pre[0], wa[0][0], wp[0][0], index_topic,
            eval_surprisal)
        print(other_finals)
        print(mean_similar / 10)

        # list.extend works in place and returns None, so return the list
        # itself rather than the result of the call.
        finals.extend(other_finals)
        return finals

    def train_predict_model(self, model_params):
        """Build, compile and train the word prediction model; return it."""
        predict_word = WordPredict(max_len=MAX_LEN,
                                   max_words=MAX_NUM_WORDS,
                                   emb_layer=self.embedding_layer)
        predict_word.build_model(**model_params)
        predict_word.compile_model(model_params)
        generator = Generator(sequences=self.sequences,
                              batch_size=PREDICT_BS,
                              max_words=MAX_NUM_WORDS,
                              max_len=MAX_LEN,
                              split=PREDICT_SPLIT)
        predict_word.train(generator, PREDICT_BS, PREDICT_SPLIT,
                           PREDICT_EPOCHS)
        return predict_word

    def load_predict_model(self, path):
        """Load a previously trained prediction model from ``path``."""
        predict_word = load_model(path)
        return predict_word

    def train_dac_model(self, model_params):
        """Build and train the smoother model and return the trained model."""
        dac = DAC()
        smoother_model = dac.build_model(hidden_sizes=[64, 64],
                                         seq_len=50,
                                         no_words=40000,
                                         emb_layer=self.embedding_layer,
                                         lr=0.01)
        generator = Generator(sequences=self.sequences,
                              batch_size=SMOOTH_BS,
                              max_words=MAX_NUM_WORDS,
                              max_len=MAX_LEN,
                              split=SMOOTH_SPLIT)
        smoother_model = dac.train(generator,
                                   full_model=smoother_model,
                                   model_params=model_params,
                                   bs=SMOOTH_BS,
                                   split=SMOOTH_SPLIT,
                                   pretrain_epochs=4,
                                   epochs=SMOOTH_EPOCHS)
        return smoother_model

    def run(self, predict_path, smoother_path, eval_path):
        """Prepare data and models, then generate and print puns.

        Args:
            predict_path: ``None`` trains a new prediction model; loading an
                existing one is currently disabled.
            smoother_path: Path of the smoother model to load; training one
                here is currently disabled.
            eval_path: Model path forwarded to :meth:`form_pun`.

        Raises:
            RuntimeError: If no smoother model is available.
        """
        self._parse_corpus(MIN_SEQ_LEN, TEXT_DATA_DIR + TEXT_DATA)
        self.prepare_emb(EMBEDDING_DIM, MAX_LEN)

        if predict_path is None:
            model_params = {
                'lstm': [16],
                'merge_layer': 'concat',
                'dense': {
                    'size': [64, 32],
                    'act': 'elu',
                    'dropout': 0
                },
                'optimizer': 'adam',
                'lr': 0.0005
            }
            self.train_predict_model(model_params)

        if smoother_path is not None:
            self.dac = DAC()
            self.dac.load_model(smoother_path)

        if self.dac is None:
            # form_pun needs self.dac; without this check every attempt
            # fails inside the retry loop below and the loop never ends.
            raise RuntimeError(
                "No smoother model available: pass smoother_path to run()")

        # GENERATE PUN - retry until one attempt succeeds.
        while True:
            try:
                final = self.form_pun(eval_path)
                break
            except Exception as exc:
                print("Pun generation attempt failed ({!r}), retrying...".
                      format(exc))
        print(final)
class DDTokenizer:
    """Character-level, case-sensitive ``Tokenizer`` with padding helpers."""

    def __init__(self, num_words, oov_token='<UNK>'):
        """Create the underlying tokenizer.

        Args:
            num_words: Forwarded to ``Tokenizer``.
            oov_token: Forwarded to ``Tokenizer``.
        """
        self.tokenizer = Tokenizer(num_words=num_words,
                                   oov_token=oov_token,
                                   filters='!"#$%&*+,-./:;<>?\\^_`{|}~\t\n',
                                   char_level=True,
                                   lower=False)
        self.has_trained = False
        self.pad_type = 'post'
        self.trunc_type = 'post'
        # The encoded data
        self.word_index = {}

    def fit(self, train_data):
        """Fit the tokenizer on ``train_data`` and cache its word index."""
        print("Training Tokenizer...")
        self.tokenizer.fit_on_texts(train_data)
        self.has_trained = True
        print("Done training...")
        # Get our training data word index
        self.word_index = self.tokenizer.word_index

    def encode(self, data, use_padding=True, padding_size=None,
               normalize=False):
        """Encode texts as id sequences.

        Args:
            data: Texts to encode.
            use_padding: Post-pad/post-truncate to a common length.
            padding_size: Target length; defaults to the longest sequence.
            normalize: Multiply the ids by ``1 / len(word_index)``.

        Returns:
            The padded array, or the raw list of sequences when
            ``use_padding`` is false.
        """
        # Encode training data sentences into sequences
        train_sequences = self.tokenizer.texts_to_sequences(data)

        if use_padding:
            # Get max training sequence length if there is none passed;
            # ``default=0`` keeps empty input from raising in max().
            if padding_size is None:
                maxlen = max((len(x) for x in train_sequences), default=0)
            else:
                maxlen = padding_size
            train_sequences = pad_sequences(train_sequences,
                                            padding=self.pad_type,
                                            truncating=self.trunc_type,
                                            maxlen=maxlen)
        if normalize:
            train_sequences = np.multiply(
                1 / len(self.tokenizer.word_index), train_sequences)
        return train_sequences

    def pad(self, data, padding_size=None):
        """Post-pad/post-truncate already encoded sequences."""
        # Get max training sequence length if there is none passed
        if padding_size is None:
            padding_size = max((len(x) for x in data), default=0)
        padded_sequence = pad_sequences(data,
                                        padding=self.pad_type,
                                        truncating=self.trunc_type,
                                        maxlen=padding_size)
        return padded_sequence

    def decode(self, array):
        """Decode id sequences back to text.

        Raises:
            AssertionError: If the tokenizer has not been fitted.
        """
        # Raised explicitly so the guard also holds under ``python -O``.
        if not self.has_trained:
            raise AssertionError(
                "Train this tokenizer before decoding a string.")
        return self.tokenizer.sequences_to_texts(array)

    def test(self, string):
        """Print an encode/decode round trip for the first text of ``string``."""
        # Encode once and reuse the result for both printouts.
        sequences = self.encode(string)
        encoded = list(sequences[0])
        decoded = self.decode(sequences)
        print("\nEncoding:")
        print("{original} -> {encoded}".format(original=string[0],
                                               encoded=encoded))
        print("\nDecoding:")
        print("{original} -> {encoded}".format(
            original=encoded, encoded=decoded[0].replace(" ", "")))

    def get_info(self):
        """Return the tokenizer's ``index_word`` mapping."""
        return self.tokenizer.index_word
if __name__ == '__main__':
    # Download the Shakespeare text through the Keras file cache and read it.
    shakespeare_url = "https://homl.info/shakespeare"
    filepath = keras.utils.get_file("shakespeare.txt", shakespeare_url)
    with open(filepath) as f:
        shakespeare_text = f.read()
    # Encode every character as an integer. With char_level=True the
    # Tokenizer builds its vocabulary from characters rather than words.
    tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
    tokenizer.fit_on_texts([shakespeare_text])
    # The next two calls discard their results; they only demonstrate the
    # encode and decode directions.
    tokenizer.texts_to_sequences(["First"])
    tokenizer.sequences_to_texts([[20, 6, 9, 8, 3]])
    max_id = len(tokenizer.word_index)  # number of distinct chars
    # dataset_size = tokenizer.document_count  # total number of chars
    # NOTE(review): the tokenizer is fitted on a single text above, so
    # confirm that document_count really is the character count before
    # re-enabling the line above.
    # Encode the whole text and subtract 1 from every id.
    [encoded] = np.array(tokenizer.texts_to_sequences([shakespeare_text])) - 1
    dataset_size = encoded.shape[0]
    train_size = dataset_size * 90 // 100
    # Use the first 90% of the encoded text as the training set.
    # NOTE(review): ``a`` duplicates the slice used on the next line and is
    # not read in the visible code.
    a = encoded[:train_size]
    dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])
    n_steps = 100
    window_length = n_steps + 1  # target = input shifted 1 character ahead
    # Next step: the window method creates windows of length 101 (the first
    # covers characters 0..100, the second 1..101, and so on), which are
    # then flattened.
class CrimData:
    """Vocabulary, embeddings matrix and batch sampling for hypernym data."""

    def __init__(self, args):
        """Build the tokenizer, lookup tables and embeddings matrix.

        Args:
            args: Dict with keys ``w2v`` (embeddings model), ``train``,
                ``test``, ``validation`` (lists of ``(hyponym, hypernym)``
                tuples), ``synonyms`` (word -> list of synonyms) and
                ``limited_vocab_n`` (``-1`` for the full model vocabulary,
                otherwise the requested vocabulary size).
        """
        # embeddings model
        self.w2v = args['w2v']
        # training / test / validation tuples
        train = args['train']
        test = args['test']
        validation = args['validation']
        # synonyms
        self.synonyms = args['synonyms']
        # if set to -1 then we will use full vector space vocab
        # otherwise use indicated size
        self.limited_vocab_n = args['limited_vocab_n']

        all_pairs = train + test + validation

        if self.limited_vocab_n > -1:
            print("Creating limited vocabulary of %d" % (self.limited_vocab_n))
            # collect words for exercise
            flat_synonym = [word for v in self.synonyms.values() for word in v]
            hyponyms = list(set([x for x, y in all_pairs]))
            hypernyms = list(set([y for x, y in all_pairs]))

            # dataset set vocab
            vocab = list(set(hyponyms + hypernyms + flat_synonym))
            vocab_len = len(vocab)
            print("Dataset vocabulary size is %d" % (vocab_len))
            model_words = list(self.w2v.vocab.keys())

            # sample words from vector space; sample more words than
            # requested to handle collisions with dataset words, but never
            # more than the model holds (sampling is without replacement).
            sample_size = min(self.limited_vocab_n + 10000, len(model_words))
            random_words = np.random.choice(model_words, sample_size,
                                            replace=False)

            # A negative count would turn the slice below into "all but the
            # last k" and add thousands of unwanted words whenever the
            # dataset vocabulary already exceeds the limit.
            extra_needed = max(0, self.limited_vocab_n - vocab_len)
            known_words = set(vocab)  # O(1) membership tests
            vocab = vocab + [
                w for w in random_words.tolist() if w not in known_words
            ][:extra_needed]
            print("Truncated vocab length is %d" % (len(vocab)))
        else:
            # choose all words in vector space
            vocab = list(self.w2v.vocab.keys())

        # create tokenizer from embeddings model
        self.tokenizer = Tokenizer(filters='', lower=False)
        # fit on vocab
        self.tokenizer.fit_on_texts(vocab)
        print("Vocab size is %d words" % (len(self.tokenizer.index_word)))

        # initialise negative word sampler
        print("Initialising negative sampler")
        self.negative_sampler = make_sampler(
            list(self.tokenizer.word_index.values()))

        print("Tokenising all dataset tuples")
        # tokenize dataset -> convert to numbers which will serve as
        # embeddings lookup keys
        self.all_data_token = self.tokenizer.texts_to_sequences(
            [[x, y] for x, y in all_pairs])

        # create hypernym dictionary lookup
        self.hypernym_id_lookup = defaultdict(list)
        for x, y in self.all_data_token:
            self.hypernym_id_lookup[x].append(y)
        # disable default factory
        self.hypernym_id_lookup.default_factory = None

        print("Creating embeddings matrix")
        # create embeddings matrix
        # NOTE(review): the width 300 is hard-coded - confirm it matches the
        # dimensionality of ``w2v``.
        self.embeddings_matrix = np.zeros(
            (len(self.tokenizer.index_word) + 1, 300))
        for k, v in self.tokenizer.index_word.items():
            # vectors should already be normalised
            self.embeddings_matrix[k] = self.w2v[v]
        print("Done!")

    def sample_synonyms(self, word_id, sample_length):
        """Return ``sample_length`` ids: synonyms of ``word_id`` plus itself.

        With enough synonyms, ``sample_length - 1`` of them are drawn at
        random and ``word_id`` is appended. Otherwise all synonyms plus
        ``word_id`` are padded with ``word_id`` to ``sample_length``.
        """
        # convert word_id to word to look for in synonym dictionary
        word = self.tokenizer.index_word[word_id]
        if word in self.synonyms:
            _syn = self.synonyms[word]
        else:
            _syn = []

        # convert list to embeddings index array
        syn_list = np.asarray(self.tokenizer.texts_to_sequences([_syn])[0])

        if sample_length > 1 and len(syn_list) >= (sample_length - 1):
            # enough synonyms: sample length-1 of them and add the word itself
            result = np.random.choice(syn_list, sample_length - 1,
                                      replace=False)
            result = np.append(result, word_id)
        else:
            # otherwise take every synonym and pad to the fixed input size
            result = np.append(syn_list, word_id)
            result = pad_sequences([result], sample_length, padding='post',
                                   value=word_id)
        # we're expecting 1-D vector
        return result.flatten()

    def get_negative_random(self, word_id, neg_count):
        """Draw ``neg_count`` sampled ids that are not hypernyms of ``word_id``."""
        neg_samples = []
        while len(neg_samples) < neg_count:
            tmp_neg = next(self.negative_sampler)
            if tmp_neg not in self.hypernym_id_lookup[word_id]:
                neg_samples.append(tmp_neg)
        return neg_samples

    def get_augmented_batch(self, query_batch, neg_count, syn_count):
        """Expand each ``(query, hypernym)`` pair with negative examples.

        Every pair produces one positive row (label 1) followed by
        ``neg_count`` rows with sampled negative hypernyms (label 0).

        Returns:
            ``(query_input, hyper_input, synonym_input, y_input)``.
        """
        rows_per_query = neg_count + 1
        total_rows = len(query_batch) * rows_per_query

        query_input = np.zeros((total_rows, 1), dtype='int32')
        hyper_input = np.zeros((total_rows, 1), dtype='int32')
        synonym_input = np.zeros((total_rows, syn_count), dtype='int32')
        y_input = np.zeros(total_rows)

        for idx, (query, hyper) in enumerate(query_batch):
            base = idx * rows_per_query
            query_input[base] = np.asarray(query)
            hyper_input[base] = np.asarray(hyper)
            synonym_input[base] = self.sample_synonyms(query, syn_count)
            y_input[base] = 1

            if neg_count > 0:
                negatives = self.get_negative_random(word_id=query,
                                                     neg_count=neg_count)
                for m, neg in enumerate(negatives):
                    row = base + (m + 1)
                    query_input[row] = np.asarray(query)
                    hyper_input[row] = np.asarray(neg)
                    # synonyms are re-sampled for every negative row
                    synonym_input[row] = self.sample_synonyms(query,
                                                              syn_count)
        return query_input, hyper_input, synonym_input, y_input

    def token_to_words(self, dataset):
        """Convert an array of ``(query_id, hypernym_id)`` rows to word pairs."""
        _q = self.tokenizer.sequences_to_texts(dataset[:, 0].reshape(-1, 1))
        _h = self.tokenizer.sequences_to_texts(dataset[:, 1].reshape(-1, 1))
        return list(zip(_q, _h))
# Tokenizer
X_tokenizer = Tokenizer(filters=args.filters,
                        lower=args.lower,
                        char_level=args.char_level,
                        oov_token='<UNK>')
X_tokenizer.fit_on_texts(X_train)
vocab_size = len(X_tokenizer.word_index) + 1  # +1 for padding token
config.logger.info(f"→ vocab_size: {vocab_size}")

# Convert texts to sequences of indices.
# The sequences have different lengths; recent NumPy releases refuse to
# build an array from ragged input unless dtype=object is requested.
original_text = X_train[0]
X_train = np.array(X_tokenizer.texts_to_sequences(X_train), dtype=object)
X_val = np.array(X_tokenizer.texts_to_sequences(X_val), dtype=object)
X_test = np.array(X_tokenizer.texts_to_sequences(X_test), dtype=object)
preprocessed_text = X_tokenizer.sequences_to_texts([X_train[0]])[0]
config.logger.info("→ Text to indices:\n"
                   f" (raw) → {original_text}\n"
                   f" (preprocessed) → {preprocessed_text}\n"
                   f" (tokenized) → {X_train[0]}")

# Label encoder
y_tokenizer = LabelEncoder()
y_tokenizer = y_tokenizer.fit(y_train)
classes = y_tokenizer.classes_
config.logger.info("→ classes:\n" f" {classes}")

# Convert labels to tokens (``class_`` keeps the first raw label).
class_ = y_train[0]
y_train = y_tokenizer.transform(y_train)
y_val = y_tokenizer.transform(y_val)
# sequences greater than MAX_SEQ_LENGTH in length will be truncated
MAX_SEQ_LENGTH = 100
X_padded = pad_sequences(X_encoded, maxlen=MAX_SEQ_LENGTH, padding="pre",
                         truncating="post")
Y_padded = pad_sequences(Y_encoded, maxlen=MAX_SEQ_LENGTH, padding="pre",
                         truncating="post")

# print the first sequence
print(X_padded[0], "\n" * 3)
print(Y_padded[0])

X, Y = X_padded, Y_padded
Y = to_categorical(Y)

TEST_SIZE = 0.10
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    test_size=TEST_SIZE,
                                                    random_state=4)

# Re-evaluate the model
# loss, acc = model.evaluate(X_test,Y_test, verbose=2)

# Predict the tags of the first sentence. The batch shape is derived from
# MAX_SEQ_LENGTH so it cannot drift from the padding length above.
# NOTE(review): ``input`` shadows the builtin; the name is kept because code
# outside this excerpt may read it.
input = X_padded[0].reshape((1, MAX_SEQ_LENGTH))
print(tag_tokenizer.sequences_to_texts(np.argmax(model.predict(input), axis=2)))
# Decode only the first row instead of the whole label matrix.
print(tag_tokenizer.sequences_to_texts(Y_padded[:1])[0])
def vschat_service(request):
    """Turn a chat message into a step-count query and chart payload.

    For a POST request the message in ``input1`` is tokenized into
    morphemes, classified by the saved intent model, and mapped to an SQL
    query: labels '1', '2' and '3' use fixed date ranges, any other label
    lets the seq2seq model generate the WHERE clause. The module-level chart
    buffers are cleared, refilled by the chart helper matching the label,
    and returned as JSON. Any other request method renders ``chat.html``.

    Args:
        request: The incoming HTTP request.

    Returns:
        ``HttpResponse`` with the JSON payload (POST) or the rendered chat
        page.
    """
    if request.method != 'POST':
        return render(request, 'chat.html')

    # Read the user's text, then load the models to obtain label and query.
    input1 = request.POST['input1']
    okt = Okt()
    max_len = 40

    tokenizer = Tokenizer()
    with open('./static/word_dict_ver03.json') as json_file:
        word_index = json.load(json_file)
        tokenizer.word_index = word_index

    tokenized_sentence = []
    temp_X = okt.morphs(input1, stem=True)  # tokenization
    tokenized_sentence.append(temp_X)
    print(tokenized_sentence)

    input_data = tokenizer.texts_to_sequences(tokenized_sentence)
    print(input_data)
    input_data = pad_sequences(input_data, maxlen=max_len)  # padding

    loaded_model = load_model('./static/best_model_ver_relu_epc500.h5')
    prediction = loaded_model.predict(input_data)
    print(prediction)
    print("label: ", np.argmax(prediction[0]))
    label = str(np.argmax(prediction[0]))

    if label == '1':
        query = "select * from stepcountData where saved_time BETWEEN date('now', '-7 days', 'localtime') AND date('now', 'localtime');"
    elif label == '2':
        query = "select * from stepcountData where saved_time BETWEEN date('now', '-35 days', 'localtime') AND date('now', 'localtime');"
    elif label == '3':
        query = "select * from stepcountData where saved_time BETWEEN date('now', '-4 months','start of month', 'localtime') AND date('now', '+1 days', 'localtime');"
    else:
        with open('./static/tokenizer_for_attention.json') as f:
            data = json.load(f)
            tokenizer = tokenizer_from_json(data)

        # Create the model
        model = Seq2seq(sos=tokenizer.word_index['\t'],
                        eos=tokenizer.word_index['\n'])
        model.load_weights("./static/attention_ckpt/attention_ckpt")

        # Implement algorithm test
        @tf.function
        def test_step(model, inputs):
            return model(inputs, training=False)

        tmp_seq = [" ".join(okt.morphs(input1))]
        print("tmp_seq : ", tmp_seq)

        test_data = tokenizer.texts_to_sequences(tmp_seq)
        print("tokenized data : ", test_data)

        prd_data = tf.keras.preprocessing.sequence.pad_sequences(
            test_data, value=0, padding='pre', maxlen=128)
        prd_data = tf.data.Dataset.from_tensor_slices(prd_data).batch(
            1).prefetch(1024)

        for seq in prd_data:
            prediction = test_step(model, seq)
            predicted_seq = tokenizer.sequences_to_texts(prediction.numpy())
            print(predicted_seq)
            print("predict tokens : ", prediction.numpy())

        # Remove the spaces the tokenizer inserted around SQL punctuation.
        # The rules are applied in this order.
        cleanup_rules = (
            (" _ ", "_"),
            ("e (", "e("),
            ("' ", "'"),
            (" '", "'"),
            (" - ", "-"),
            ("+ ", "+"),
            ("- ", "-"),
        )
        predicted_seq = str(predicted_seq[0])
        for old, new in cleanup_rules:
            predicted_seq = predicted_seq.replace(old, new)
        print(predicted_seq)

        # NOTE(review): the generated WHERE clause is spliced into the SQL
        # text unescaped and ultimately derives from user input - confirm
        # that the query helpers run it read-only or validate it.
        query = "select * from stepcountData where " + predicted_seq + ";"

    # Reset the module-level chart buffers before the helpers refill them.
    legend_value.clear()
    xValue.clear()
    yValue.clear()
    response.clear()
    x1.clear()
    x2.clear()
    y1.clear()
    y2.clear()
    print(legend_value, xValue, yValue, response, x1, x2, y1, y2)

    try:
        if label == "2":
            show_weeks_avg(query)
            print("주별 평균")
        elif label == "3":
            show_months_avg(query)
            print("월별 평균")
        elif label == "6":
            if check_week(query) == True:
                show_by_week(query)
                print('주별비교')
            else:
                show_by_month(query)
                print('월별비교')
        else:
            show_barchart(query)
            print('바차트')
    except HTTPError:
        print("httperror")
        print("데이터를 불러올 수 없습니다. 텍스트를 다시 입력하세요")
    except IndexError:
        print("indexerror")
        print("데이터를 불러올 수 없습니다. 텍스트를 다시 입력하세요")
    # TODO: find out how to show a notification message for these errors.

    # Collect the response text, query results and label in one dict.
    output = dict()
    output['response'] = "그래프가 출력되었습니다"
    output['xValues'] = xValue
    output['yValues'] = yValue
    output['label'] = label
    output['legend_value'] = legend_value
    print(output)
    print("-----------------------------------------")
    print("-----------------------------------------")
    return HttpResponse(json.dumps(output), status=200)
# Procesamos data con la red neuronal y decodifcamos el intents try: # PROCESAMIENTO DE LA RED NEURONAL # tokenizar por palabras token = text_to_word_sequence(data) # obtener la secuencia seq = tok.texts_to_sequences(token) # encode la secuencia encoded = np.add.reduce(to_categorical(seq, size)) pred = model.predict(np.array([encoded])) seq2 = np.argmax(pred, axis=None, out=None) intent_get = tok2.sequences_to_texts(np.array([[seq2]])) intent_get = intent_get[-1] if spread: print("#" * 20 + " SUMMARY " + "#" * 20) print("\t Context: {0}".format(context)) print("\t Hope: {0}".format(hope)) print("\t Data: {0}".format(data)) print("\t Bag: {0}".format(bag)) print("\t Intent: {0}".format(intent_get)) print("#" * 49) time.sleep(5) # CONTEXTO CONVERSATION if hope and context in ["conversation"] and data not in [""]: hope = False
class Preprocessor:
    """Cache a TensorFlow-Datasets text corpus on disk and turn it into model inputs.

    On first use the 'train', 'test' and 'validation' splits are loaded through
    ``tfds.load``, wrapped in start/end-of-sequence markers and pickled to
    ``prepared-datasets/<dataset_name>.pkl`` next to this module. Later calls
    read that pickle and tokenize, length-filter and pad the samples.
    """

    def __init__(self, dataset_name):
        """Prepare the on-disk cache for ``dataset_name`` if it does not exist yet.

        Args:
            dataset_name: name handed to ``tfds.load`` and used as the cache
                file's base name.
        """
        self.name = dataset_name
        # os.path.join keeps the cache location valid on every platform; the
        # resulting path is the same as before on Windows.
        self.path = os.path.join(os.path.dirname(__file__), 'prepared-datasets', self.name + '.pkl')
        self.eos_token = '<eos>'
        self.sos_token = '<sos>'
        self.tokenizer = None
        if not os.path.exists(self.path):
            print('loading dataset to disk. this may take a minute or two...')
            raw_dataset = self._load_dataset_from_tensorflow()
            ready_to_save_data = self._prepare_dataset_for_saving(raw_dataset)
            self._save_prepared_data(ready_to_save_data)
            print('dataset loaded to disk.')

    def _load_dataset_from_tensorflow(self):
        """Load the train, test and validation splits (in that order) via ``tfds.load``."""
        return tfds.load(self.name, as_supervised=True, split=['train', 'test', 'validation'])

    def _prepare_dataset_for_saving(self, raw_data):
        """Convert the (train, test, validation) splits into lists of text pairs."""
        train_data = self._prepare_data_as_supervised(raw_data[0])
        test_data = self._prepare_data_as_supervised(raw_data[1])
        validation_data = self._prepare_data_as_supervised(raw_data[2])
        return train_data, test_data, validation_data

    def _wrap_and_decode(self, text):
        """Surround ``text`` with the sos/eos markers and return it as an ASCII ``str``.

        Characters that cannot be decoded as ASCII are dropped.
        NOTE(review): assumes ``text`` is a TensorFlow string tensor (the result
        must offer ``.numpy()``) - confirm against the tfds output.
        """
        wrapped = self.sos_token + ' ' + text + ' ' + self.eos_token
        return wrapped.numpy().decode('ASCII', 'ignore')

    def _prepare_data_as_supervised(self, data):
        """Return a list of ``(input_text, target_text)`` string tuples.

        Args:
            data: iterable of ``(input, target)`` samples.
        """
        return [
            (self._wrap_and_decode(sample[0]), self._wrap_and_decode(sample[1]))
            for sample in data
        ]

    def _save_prepared_data(self, data):
        """Wrap the three prepared splits in a ``_DataHolder`` and pickle it."""
        self._save(self._DataHolder(data))

    def _save(self, data_holder):
        """Pickle ``data_holder`` to ``self.path``, creating the cache directory if needed."""
        directory = os.path.dirname(self.path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(self.path, 'wb') as f:
            pickle.dump(data_holder, f, protocol=pickle.HIGHEST_PROTOCOL)

    def _load(self, split):
        """Read the cached data holder and return the samples of ``split``."""
        # NOTE: pickle.load executes arbitrary code from the file; the cache is
        # written by _save() above and must not be replaced by untrusted data.
        with open(self.path, 'rb') as f:
            data_holder = pickle.load(f)
        return data_holder.get_data(split)

    def load_preprocessed_data(self, split, vocab_size, max_input_len=400, max_target_len=150,
                               filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n', to_lower=True,
                               padding='post'):
        """Tokenize, length-filter and optionally pad one split.

        The tokenizer is created and fitted (on the training inputs only) when
        ``split`` is 'train'; the other splits reuse it, so 'train' has to be
        loaded first.

        Args:
            split: 'train', 'test' or 'validation'.
            vocab_size: passed to the tokenizer as ``num_words``.
            max_input_len: samples whose tokenized input is longer are dropped
                together with their target.
            max_target_len: samples whose tokenized target is longer are
                dropped together with their input.
            filters: characters the tokenizer removes from the texts. The
                default contains neither '<' nor '>'.
            to_lower: whether the tokenizer lower-cases the texts.
            padding: 'post' or 'pre' to pad the sequences, ``None`` to leave
                them as plain lists.

        Returns:
            ``(data, raw_data)``. With padding, ``data`` is the list
            ``[x_encoder, x_decoder, y, encoder_lengths, decoder_lengths]`` of
            LongTensors; without padding it is a tuple of two lists of integer
            sequences. ``raw_data`` holds the ``(input_text, target_text)``
            tuples of exactly the samples kept in ``data``, in the same order,
            so both can be indexed with the same position.

        Raises:
            AssertionError: if ``split`` is not a string or no tokenizer has
                been fitted yet.
            ValueError: if ``split`` is not one of the three known names.
        """
        assert isinstance(split, str)
        if split == 'train':
            self.tokenizer = Tokenizer(num_words=vocab_size, filters=filters, lower=to_lower,
                                       oov_token='<unk>', split=' ')
        raw_data = self._load(split)
        inputs, targets = self._to_2_lists(raw_data)
        if split == 'train':
            # The vocabulary is built from the training inputs only.
            self.tokenizer.fit_on_texts(inputs)
        assert self.tokenizer is not None, "load the 'train' split first to fit the tokenizer"

        input_seqs = self.tokenizer.texts_to_sequences(inputs)
        target_seqs = self.tokenizer.texts_to_sequences(targets)

        # Apply one keep-list to sequences and raw texts alike so that
        # raw_data[i] always belongs to the i-th remaining sample.
        keep = self._kept_indices(input_seqs, target_seqs, max_input_len, max_target_len)
        data = ([input_seqs[i] for i in keep], [target_seqs[i] for i in keep])
        raw_data = [raw_data[i] for i in keep]

        if padding is not None:
            data = self._as_supervised_pad_list_of_sequences(data, padding=padding)
        return data, raw_data

    def _to_2_lists(self, data):
        """Split a sequence of 2-element samples into ``(inputs, targets)`` lists."""
        inputs = []
        targets = []
        for sample in data:
            inputs.append(sample[0])
            targets.append(sample[1])
        return inputs, targets

    def _kept_indices(self, inputs, targets, max_input_len, max_target_len):
        """Return the positions whose input and target both respect the length limits."""
        return [
            position
            for position, (source, target) in enumerate(zip(inputs, targets))
            if len(source) <= max_input_len and len(target) <= max_target_len
        ]

    def _filter_by_lengths(self, data, max_input_len, max_target_len):
        """Drop over-long samples from ``data`` in place and return ``data``.

        Args:
            data: pair ``(inputs, targets)`` of parallel lists of sequences.
        """
        keep = self._kept_indices(data[0], data[1], max_input_len, max_target_len)
        # Slice assignment keeps the caller's list objects, like the former
        # pop-based loop did, but in linear time.
        data[0][:] = [data[0][i] for i in keep]
        data[1][:] = [data[1][i] for i in keep]
        return data

    def _pad_to_long_tensor(self, sequences, padding):
        """Pad ``sequences`` with ``pad_sequences`` and return them as a LongTensor."""
        return torch.from_numpy(pad_sequences(sequences, padding=padding)).type(torch.LongTensor)

    def _as_supervised_pad_list_of_sequences(self, data, padding='post'):
        """Build the five training tensors from ``(inputs, targets)`` sequences.

        Returns:
            ``[inputs, decoder_inputs, targets, encoder_lengths, decoder_lengths]``
            where ``decoder_inputs`` is every target without its last token,
            ``targets`` is every target without its first token and the two
            length tensors hold the unpadded lengths of ``inputs`` and
            ``targets``.
        """
        inputs = data[0]
        decoder_inputs = [t[:-1] for t in data[1]]  # last token (end marker) removed
        targets = [t[1:] for t in data[1]]  # first token (start marker) removed
        encoder_lengths = torch.LongTensor(self._get_lengths_from_sequences(inputs))
        decoder_lengths = torch.LongTensor(self._get_lengths_from_sequences(targets))
        return [
            self._pad_to_long_tensor(inputs, padding),
            self._pad_to_long_tensor(decoder_inputs, padding),
            self._pad_to_long_tensor(targets, padding),
            encoder_lengths,
            decoder_lengths,
        ]

    def _get_lengths_from_sequences(self, sequences):
        """Return the length of every sequence as a list."""
        return [len(seq) for seq in sequences]

    def _create_data_loader(self, split, vocab_size, batch_size, max_input_len=400,
                            max_target_len=150, filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n',
                            to_lower=True, padding='post', shuffle=True, num_workers=6):
        """Create the ``DataLoader`` of one split.

        ``filters`` and ``to_lower`` are forwarded to the tokenizer set-up
        (they used to be silently replaced by the defaults).
        """
        dataset = self._SummarizationDataset(self, split, vocab_size, max_input_len=max_input_len,
                                             max_target_len=max_target_len, filters=filters,
                                             to_lower=to_lower, padding=padding)
        return DataLoader(dataset, batch_size=batch_size, pin_memory=True, shuffle=shuffle,
                          num_workers=num_workers)

    def create_data_loaders(self, vocab_size, batch_size, num_workers=6, max_input_len=400,
                            max_target_len=150, filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n',
                            to_lower=True, padding='post'):
        """Create the train, test and validation ``DataLoader`` objects.

        The training loader is built first because building it fits the
        tokenizer that the other two splits reuse.

        Returns:
            ``(train, test, validation)``.
        """
        start = timer()
        shared = dict(num_workers=num_workers, max_input_len=max_input_len,
                      max_target_len=max_target_len, filters=filters, to_lower=to_lower,
                      padding=padding)
        train = self._create_data_loader('train', vocab_size, batch_size, **shared)
        test = self._create_data_loader('test', vocab_size, batch_size, **shared)
        validation = self._create_data_loader('validation', vocab_size, batch_size, **shared)
        print('preprocessing time: ', timer() - start)
        return train, test, validation

    def sequences_to_texts(self, sequences):
        """Delegate to the tokenizer's ``sequences_to_texts``."""
        return self.tokenizer.sequences_to_texts(sequences)

    def texts_to_sequences(self, texts):
        """Delegate to the tokenizer's ``texts_to_sequences``."""
        return self.tokenizer.texts_to_sequences(texts)

    def get_eos_token(self):
        """Return the tokenizer output for the end marker given as a one-token text."""
        return self.texts_to_sequences([[self.eos_token]])

    def get_sos_token(self):
        """Return the tokenizer output for the start marker given as a one-token text."""
        return self.texts_to_sequences([[self.sos_token]])

    def filters(self):
        """Return the filter characters of the current tokenizer."""
        return self.tokenizer.filters

    class _DataHolder:
        """Picklable container for the (train, test, validation) sample lists."""

        # Position of every split inside self.data.
        _SPLIT_INDEX = {'train': 0, 'test': 1, 'validation': 2}

        def __init__(self, data):
            self.data = data

        def get_data(self, split):
            """Return the samples of ``split``.

            Raises:
                ValueError: if ``split`` is not 'train', 'test' or 'validation'.
            """
            assert isinstance(split, str)
            index = self._SPLIT_INDEX.get(split)
            if index is None:
                raise ValueError('valid values are: "train", "test", "validation".')
            return self.data[index]

    class _SummarizationDataset(Dataset):
        """Dataset yielding the five tensors of a sample together with its raw text pair."""

        def __init__(self, preprocessor, split, vocab_size, max_input_len=400, max_target_len=150,
                     filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n', to_lower=True, padding='post'):
            self.data, self.raw_data = preprocessor.load_preprocessed_data(
                split, vocab_size, max_input_len=max_input_len, max_target_len=max_target_len,
                filters=filters, to_lower=to_lower, padding=padding)
            self.len = self.data[0].size(0)

        def __getitem__(self, index):
            """Return ``(tensors, raw_pair)`` for the sample at ``index``."""
            data = tuple(tensor[index] for tensor in self.data[:5])
            raw_data = self.raw_data[index]
            return data, raw_data

        def __len__(self):
            return self.len
# Build the output (label) data used for training.
clases = sorted(set(t))
size_s = len(clases) + 1
tok2 = Tokenizer()
tok2.fit_on_texts(clases)
tokens_s = tok2.texts_to_sequences(t)
# One multi-hot row per label: the one-hot vectors of its tokens summed up.
y_train = np.array([np.add.reduce(to_categorical(Y, size_s)) for Y in tokens_s])

# Set up the Keras neural network (loaded from the saved model file).
model = keras.models.load_model('model.h5')
model.summary()

# Run one sample sentence through the model as a sanity check.
data = text_to_word_sequence("I need to create a new account")
secuence = tok.texts_to_sequences(data)
secuence_one_hot = to_categorical(secuence, size)
encode = np.add.reduce(secuence_one_hot)
separator = "-" * 50
print(f"secuence -> {secuence}")
print(separator)
print(np.argmax(to_categorical(secuence, size), axis=1))
print(separator)
a = np.rint(model.predict(np.array([encode])))
print(a)
# Index of the largest value over the flattened prediction.
i = np.argmax(a)
print(tok2.sequences_to_texts(np.array([[i]])))
plt.show()

# Generate the next 10 words in a sentence.
best_model = load_model(model_file)
start = 'Today as i was leaving for work'
test_seqs = tokenizer.texts_to_sequences([start])
for i in range(10):
    test_seqs_padded = pad_sequences(
        test_seqs,
        maxlen=sequence_len,
        padding='pre',
        truncating='pre',
    )
    # Use best_model to generate the next word
    # (remember to convert from categorical to ordinal).
    # TODO: Replace _ANS_ with your answers
    probabilities = best_model.predict([test_seqs_padded])
    next_word = probabilities.argmax(axis=1)
    # Extend the running sequence so the next step sees the new word.
    test_seqs[0].append(next_word[0])
# NOTE(review): the original indentation was lost; the final print is assumed
# to run once after the loop - confirm.
print(tokenizer.sequences_to_texts(test_seqs))
def next_char(text, temperature=1):
    """Sample the text of the next token that follows ``text``.

    The model's probabilities for the last time step are turned into logits,
    divided by ``temperature`` and used to draw a single class at random.

    Args:
        text: the text generated so far; it is passed to ``preprocess`` as a
            one-element list.
        temperature: divisor applied to the log-probabilities before sampling.

    Returns:
        The first entry of ``tokenizer.sequences_to_texts`` for the sampled id.
    """
    model_input = preprocess([text])
    # The -1: slice keeps only the last time step while preserving a leading
    # axis of size one, which tf.random.categorical needs (2-D logits).
    last_step_proba = model.predict(model_input)[0, -1:, :]
    scaled_logits = tf.math.log(last_step_proba) / temperature
    sampled_class = tf.random.categorical(scaled_logits, num_samples=1)
    # NOTE(review): the +1 presumably maps 0-based class indices back to the
    # tokenizer's 1-based ids - confirm against preprocess().
    token_id = sampled_class + 1
    return tokenizer.sequences_to_texts(token_id.numpy())[0]
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
model.summary()

# Train the model on X (values) and Y (labels).
# Use verbose=1 or 2 to get output during training.
history = model.fit(X, Y, epochs=num_epochs, verbose=0)

# Predict: classify a few sample sentences and report the winning tag.
sentences = [
    "i have a cute fluffy cat",
    "the cat is fluffy",
    "i like to dance",
]
sequences = tokenizer.texts_to_sequences(sentences)
# Round-trip through the tokenizer to show what the model actually sees.
decoded_sentences = tokenizer.sequences_to_texts(sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_len)
predictions = model.predict(padded_sequences)

for i, prediction in enumerate(predictions):
    best_index = np.argmax(prediction)
    category = category_tags[best_index]
    print(f"----- sekntence {i} -----")
    print(f"decoded text: {decoded_sentences[i]}")
    print(f"sentence: {sentences[i]} | tag: {category} | prediction values: {prediction} | max prediction index: {best_index}")
LSTM.fit(x_train, y_train, batch_size=64, epochs=30, validation_split=0.1, callbacks=[early_stopping_cb, reduce_learing_cb]) #prediction string = "I really want to go to a beautiful place" index_list = tokenizer.texts_to_sequences([string])[0] for i in range(4): input_ = np.array(index_list[i:i + max_len - 1]).reshape(1, max_len - 1) predicted_results = LSTM.predict(input_) index_list.append(np.argmax(predicted_results)) word_list = tokenizer.sequences_to_texts([index_list]) output = " ".join(word_list) print(output) dump(tokenizer, open("tokenizer.pkl", "wb")) #creating training samples max_len = 40 x = [] y = [] for i in range(0, len(clean_tokens) - max_len - 1, 8): x_temp = clean_tokens[i:i + max_len] x_temp2 = [wordindex_dic.get(word, 0) for word in x_temp] y_temp = clean_tokens[i + 1:i + max_len + 1] y_temp2 = [wordindex_dic.get(word, 0) for word in y_temp] x.append(x_temp2)
# generated_sequence = generated_sequence.reshape(1,308) for word in range(max_len): #ip_one_hot = one_hot(generated_sequence, num_classes) generated_sequence = generated_sequence.reshape(1,308) prediction = model.predict( generated_sequence[None], verbose = 0)[0] sampled_token = np.random.choice( np.arange(num_classes), p=prediction) #print(generated_sequence) generated_sequence = np.append( generated_sequence[0,1:],sampled_token) print(generated_sequence) #print(generated_sequence) #generated_sequence = generated_sequence.reshape(1,308) #generated_sequence = generated_sequence.astype(int) generated_txt = tk.sequences_to_texts([generated_sequence])[0] print("Sample {}: {}".format(i, generated_txt)) plt.plot(history.history['accuracy']) plt.title('model accuracy') plt.ylabel('accuracy') plt.xlabel('epoch') plt.legend(['train accuracy'], loc='upper left') plt.show() # summarize history for loss plt.plot(history.history['loss']) plt.title('model loss') plt.ylabel('loss') plt.xlabel('epoch')