def build_evaluate_data(lines, tid=0):
    with open('worddata/word_dict.pkl', 'rb') as f:
        word_dict = pickle.load(f)

    def word2id(c):
        if c in word_dict:
            return word_dict[c]
        else:
            return 0

    cnt = 0
    history = []
    true_utt = []
    for line in lines:
        fields = line.rstrip().lower().split('\t')
        utterance = fields[-1].split('###')
        history.append([
            list(map(word2id, text_to_word_sequence(each_utt)))
            for each_utt in utterance
        ])
        true_utt.append(list(map(word2id, text_to_word_sequence(fields[0]))))
        cnt += 1
        if cnt % 10000 == 0:
            print(tid, cnt)
    return history, true_utt
def hierarchical_tokenize_and_pad(data, tokenizer=None, max_sequence_len=200,
                                  max_sequences=20, enforce_max_len=False,
                                  filter_words=False):
    """
    :param data: 2-D array whose first column holds the raw documents
    :param tokenizer: optional pre-fitted Keras Tokenizer; a new one is fitted if None
    :param max_sequence_len: maximum number of words per sentence
    :param max_sequences: maximum number of sentences per document
    :param enforce_max_len: if False, shrink the maxima to what the data actually needs
    :param filter_words: if True, drop tokens that are not alphabetic (apostrophes/hyphens allowed)
    :return: padded index tensor, tokenizer, effective max_sequence_len, effective max_sequences
    """
    temp_data = list()
    for seq in data[:, 0]:
        temp_data.append(' '.join(seq.split()))
    if tokenizer is None:
        tokenizer = Tokenizer(filters='!"#$%&()*+,./:;<=>?@[\\]^_`{|}~', lower=True)
        tokenizer.fit_on_texts(temp_data)

    raw_data = list()
    max_sequences_actual = -1
    max_sequence_len_actual = -1
    for seq in data[:, 0]:
        sentences = nltk.tokenize.sent_tokenize(seq)
        raw_data.append(sentences)
        max_sequences_actual = max(len(sentences), max_sequences_actual)
        for sentence in sentences:
            word_tokens = text_to_word_sequence(
                sentence, filters='!"#$%&()*+,./:;<=>?@[\\]^_`{|}~', lower=True)
            max_sequence_len_actual = max(len(word_tokens), max_sequence_len_actual)

    if not enforce_max_len:
        max_sequence_len = min(max_sequence_len, max_sequence_len_actual)
        max_sequences = min(max_sequences, max_sequences_actual)

    data_x = np.zeros((len(data), max_sequences, max_sequence_len), dtype='int32')
    print("Max. Seq. Length: %d; Max Seq.: %d" % (max_sequence_len, max_sequences))

    index_filter = set()
    if filter_words:
        for word, i in tokenizer.word_index.items():
            if not (word.isalpha() or "'" in word or "-" in word):
                index_filter.add(i)

    for i, sentences in enumerate(raw_data):
        for j, sentence in enumerate(sentences):
            if j < max_sequences:
                k = 0
                word_tokens = text_to_word_sequence(
                    ' '.join(sentence.split()),
                    filters='!"#$%&()*+,./:;<=>?@[\\]^_`{|}~', lower=True)
                for word in word_tokens:
                    if k < max_sequence_len:
                        if not filter_words or tokenizer.word_index[word] not in index_filter:
                            data_x[i, j, k] = tokenizer.word_index[word]
                            k = k + 1
    return data_x, tokenizer, max_sequence_len, max_sequences
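# Hypothetical usage sketch for hierarchical_tokenize_and_pad (not from the original
# source): it assumes a 2-D object array whose first column holds the raw documents,
# matching the data[:, 0] access above, and that nltk's punkt models are available.
import numpy as np

docs = np.array([["First sentence here. Second sentence here."],
                 ["Another short document."]], dtype=object)
data_x, tok, seq_len, n_seq = hierarchical_tokenize_and_pad(
    docs, max_sequence_len=50, max_sequences=5)
# data_x has shape (2, n_seq, seq_len) and holds word indices, zero-padded.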
def load_data(total_words):
    process_num = 10
    executor = concurrent.futures.ProcessPoolExecutor(process_num)
    base = 0
    results = []
    history = []
    true_utt = []
    word_dict = dict()
    vectors = []

    # Build the vocabulary and embedding matrix from pre-trained GloVe vectors.
    with open('data/glove.twitter.27B.200d.txt', encoding='utf8') as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            line = line.split(' ')
            word_dict[line[0]] = i
            vectors.append(line[1:])
            if i > total_words:
                break
    with open('worddata/embedding_matrix.pkl', "wb") as f:
        pickle.dump(vectors, f)

    # Tokenize the training conversations in parallel chunks.
    with open("data/biglearn_train.old.txt", encoding="utf8") as f:
        lines = f.readlines()
        total_num = 1000000
        print(total_num)
        low = 0
        step = total_num // process_num
        print(step)
        while low < total_num:
            results.append(
                executor.submit(build_data, lines[low:low + step], word_dict, base))
            base += 1
            low += step
        for result in results:
            h, t = result.result()
            history += h
            true_utt += t
        print(len(history))
        print(len(true_utt))
        pickle.dump([history, true_utt], open("worddata/train.pkl", "wb"))

    # Encode the action texts with the same vocabulary.
    actions_id = []
    with open('emb/actions.txt', encoding='utf8') as f:
        actions = f.readlines()

    def word2id(c):
        if c in word_dict:
            return word_dict[c]
        else:
            return 0

    for action in actions:
        actions_id.append(
            [word2id(word) for word in text_to_word_sequence(action)])
    with open('worddata/actions_embeddings.pkl', 'wb') as f:
        pickle.dump(actions_id, f)
def build_data(lines, word_dict, tid=0):
    def word2id(c):
        if c in word_dict:
            return word_dict[c]
        else:
            return 0

    cnt = 0
    history = []
    true_utt = []
    for line in lines:
        fields = line.rstrip().lower().split('\t')
        utterance = fields[1].split('###')
        history.append([
            list(map(word2id, text_to_word_sequence(each_utt)))
            for each_utt in utterance
        ])
        true_utt.append(list(map(word2id, text_to_word_sequence(fields[2]))))
        cnt += 1
        if cnt % 10000 == 0:
            print(tid, cnt)
    return history, true_utt
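# Illustrative sketch (the sample line and vocabulary are assumptions, not from the
# original data files): build_data expects tab-separated lines where field 1 holds the
# dialogue history with turns joined by '###' and field 2 holds the true response.
sample_lines = ["label\thow are you ?###i am fine thanks\tglad to hear it\n"]
sample_dict = {'how': 1, 'are': 2, 'you': 3, 'i': 4, 'am': 5, 'fine': 6}
history, true_utt = build_data(sample_lines, sample_dict)
# history  -> [[[1, 2, 3], [4, 5, 6, 0]]]   (out-of-vocabulary words map to 0)
# true_utt -> [[0, 0, 0, 0]]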
def encode_text_vectors(self, texts, pca_dims=50, tsne_dims=None,
                        tsne_seed=None, return_pca=False,
                        return_tsne=False):

    # if a single text, force it into a list:
    if isinstance(texts, str):
        texts = [texts]

    vector_output = Model(inputs=self.model.input,
                          outputs=self.model.get_layer('attention').output)
    encoded_vectors = []
    maxlen = self.config['max_length']
    for text in texts:
        if self.config['word_level']:
            text = text_to_word_sequence(text, filters='')
        text_aug = [self.META_TOKEN] + list(text[0:maxlen])
        encoded_text = textgenrnn_encode_sequence(text_aug, self.vocab, maxlen)
        encoded_vector = vector_output.predict(encoded_text)
        encoded_vectors.append(encoded_vector)

    encoded_vectors = np.squeeze(np.array(encoded_vectors), axis=1)

    if pca_dims is not None:
        assert len(texts) > 1, "Must use more than 1 text for PCA"
        pca = PCA(pca_dims)
        encoded_vectors = pca.fit_transform(encoded_vectors)

    if tsne_dims is not None:
        tsne = TSNE(tsne_dims, random_state=tsne_seed)
        encoded_vectors = tsne.fit_transform(encoded_vectors)

    return_objects = encoded_vectors
    if return_pca or return_tsne:
        return_objects = [return_objects]
    if return_pca:
        return_objects.append(pca)
    if return_tsne:
        return_objects.append(tsne)

    return return_objects
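# Hedged usage sketch for encode_text_vectors, assuming a trained textgenrnn-style
# instance named `textgen`; the example texts and pca_dims value are illustrative only.
vectors = textgen.encode_text_vectors(
    ["first example text", "second example text", "third example text"],
    pca_dims=2)
# vectors is a (3, 2) array of PCA-reduced activations from the 'attention' layer.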
def load_seq2vec_data(account, mod):
    data = joblib.load(os.path.join(accounts_dir, account + '_tweets.pkl')).text.to_list()
    date = joblib.load(os.path.join(accounts_dir, account + '_tweets.pkl')).index.to_list()
    word_index, max_len = joblib.load(
        os.path.join(pre_data_dir, mod, '{0}_word_index.pkl'.format(mod)))

    new = [
        text_pre.text_to_word_sequence(
            tweet, filters='!"#$%&()*+,-./:;<=>?[\\]^_`{|}~\t\n')
        for tweet in data
    ]

    x_pred_seq2vec = []
    id_to_drop = []
    counter_ind = 0
    for tweet in new:
        # Keep only tokens that exist in the training vocabulary.
        sentence = [
            word_index.get(i) for i in tweet if word_index.get(i) is not None
        ]
        if len(sentence) < 1:
            # Placeholder sequence for tweets with no known tokens;
            # remember their dates so they can be dropped downstream.
            x_pred_seq2vec.append(['1', '1', '1', '1'])
            id_to_drop.append(date[counter_ind])
        else:
            x_pred_seq2vec.append(sentence)
        counter_ind += 1

    x_pred_seq2vec = sequence.pad_sequences(x_pred_seq2vec, maxlen=max_len)
    joblib.dump(x_pred_seq2vec,
                os.path.join(pre_pred_dir, account, mod, 'x_pred_seq2vec.pkl'))
    joblib.dump(id_to_drop,
                os.path.join(pre_pred_dir, account, mod, 'id_to_drop.pkl'))
    return x_pred_seq2vec
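# Hypothetical call (the account name and model identifier are placeholders, not from
# the original project); it assumes the *_tweets.pkl and *_word_index.pkl files exist
# under accounts_dir and pre_data_dir as used above.
x_pred = load_seq2vec_data('example_account', 'example_model')
# x_pred is a padded 2-D array of word indices, one row per tweet.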
def process(sentence):
    """ Applies word tokenizing to input sentence and removes the stopwords. """
    tokenized_words = text_to_word_sequence(sentence)
    words = [word for word in tokenized_words if word not in STOPWORDS]
    return ' '.join(words)
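# Minimal usage sketch, assuming STOPWORDS is built from NLTK's English stopword list
# (an assumption; the original snippet does not show how STOPWORDS is defined):
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))
print(process("This is a simple example sentence"))  # -> "simple example sentence"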
def train_on_texts(self, texts, context_labels=None,
                   batch_size=128,
                   num_epochs=50,
                   verbose=1,
                   new_model=False,
                   gen_epochs=1,
                   train_size=1.0,
                   max_gen_length=300,
                   validation=True,
                   dropout=0.0,
                   via_new_model=False,
                   save_epochs=0,
                   multi_gpu=False,
                   **kwargs):

    if new_model and not via_new_model:
        self.train_new_model(texts,
                             context_labels=context_labels,
                             num_epochs=num_epochs,
                             gen_epochs=gen_epochs,
                             train_size=train_size,
                             batch_size=batch_size,
                             dropout=dropout,
                             validation=validation,
                             save_epochs=save_epochs,
                             multi_gpu=multi_gpu,
                             **kwargs)
        return

    if context_labels:
        context_labels = LabelBinarizer().fit_transform(context_labels)

    if 'prop_keep' in kwargs:
        train_size = kwargs['prop_keep']

    if self.config['word_level']:
        texts = [text_to_word_sequence(text, filters='') for text in texts]

    # calculate all combinations of text indices + token indices
    indices_list = [
        np.meshgrid(np.array(i), np.arange(len(text) + 1))
        for i, text in enumerate(texts)
    ]
    indices_list = np.block(indices_list)

    # If a single text, there will be 2 extra indices, so remove them
    # Also remove first sequences which use padding
    if self.config['single_text']:
        indices_list = indices_list[self.config['max_length']:-2, :]

    indices_mask = np.random.rand(indices_list.shape[0]) < train_size

    if multi_gpu:
        num_gpus = len(K.tensorflow_backend._get_available_gpus())
        batch_size = batch_size * num_gpus

    gen_val = None
    val_steps = None
    if train_size < 1.0 and validation:
        indices_list_val = indices_list[~indices_mask, :]
        gen_val = generate_sequences_from_texts(
            texts, indices_list_val, self, context_labels, batch_size)
        val_steps = max(
            int(np.floor(indices_list_val.shape[0] / batch_size)), 1)

    indices_list = indices_list[indices_mask, :]

    num_tokens = indices_list.shape[0]
    assert num_tokens >= batch_size, "Fewer tokens than batch_size."

    level = 'word' if self.config['word_level'] else 'character'
    print("Training on {:,} {} sequences.".format(num_tokens, level))

    steps_per_epoch = max(int(np.floor(num_tokens / batch_size)), 1)

    gen = generate_sequences_from_texts(
        texts, indices_list, self, context_labels, batch_size)

    base_lr = 4e-3

    # scheduler function must be defined inline.
    def lr_linear_decay(epoch):
        return (base_lr * (1 - (epoch / num_epochs)))

    if context_labels is not None:
        if new_model:
            weights_path = None
        else:
            weights_path = "{}_weights.hdf5".format(self.config['name'])
            self.save(weights_path)

        self.model = textgenrnn_model(self.num_classes,
                                      dropout=dropout,
                                      cfg=self.config,
                                      context_size=context_labels.shape[1],
                                      weights_path=weights_path)

    model_t = self.model

    if multi_gpu:
        # Do not locate model/merge on CPU since sample sizes are small.
        parallel_model = multi_gpu_model(self.model, gpus=num_gpus,
                                         cpu_merge=False)
        parallel_model.compile(loss='categorical_crossentropy',
                               optimizer=RMSprop(lr=4e-3, rho=0.99))
        model_t = parallel_model
        print("Training on {} GPUs.".format(num_gpus))

    model_t.fit_generator(gen,
                          steps_per_epoch=steps_per_epoch,
                          epochs=num_epochs,
                          callbacks=[
                              LearningRateScheduler(lr_linear_decay),
                              generate_after_epoch(self, gen_epochs, max_gen_length),
                              save_model_weights(self, num_epochs, save_epochs)
                          ],
                          verbose=verbose,
                          max_queue_size=10,
                          validation_data=gen_val,
                          validation_steps=val_steps)

    # Keep the text-only version of the model if using context labels
    if context_labels is not None:
        self.model = Model(inputs=self.model.input[0],
                           outputs=self.model.output[1])
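# Hedged usage sketch for train_on_texts on a textgenrnn-style model; the texts,
# epoch counts, and small batch_size below are illustrative assumptions chosen so the
# num_tokens >= batch_size assertion holds for a toy corpus.
texts = ["never gonna give you up", "never gonna let you down"]
textgen = textgenrnn()  # assumes the textgenrnn class from the library is available
textgen.train_on_texts(texts, num_epochs=2, gen_epochs=1, batch_size=2)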
def split_and_pad(input_string):
    # Truncate the tokenized string to at most self.max_seq_len words
    # (self.max_seq_len is taken from the enclosing scope).
    tokenised_string = text_to_word_sequence(input_string)
    return ' '.join(
        tokenised_string[:min(self.max_seq_len, len(tokenised_string))])
def test_text_to_word_sequence_unicode_multichar_split(self):
    text = u'ali!stopveli?stopkırkstopdokuzstopelli'
    seq = preprocessing_text.text_to_word_sequence(text, split='stop')
    self.assertEqual(seq, [u'ali', u'veli', u'kırk', u'dokuz', u'elli'])
def test_text_to_word_sequence_unicode(self):
    text = u'ali! veli? kırk dokuz elli'
    seq = preprocessing_text.text_to_word_sequence(text)
    self.assertEqual(seq, [u'ali', u'veli', u'kırk', u'dokuz', u'elli'])
def test_text_to_word_sequence_multichar_split(self):
    text = 'hello!stop?world!'
    seq = preprocessing_text.text_to_word_sequence(text, split='stop')
    self.assertEqual(seq, ['hello', 'world'])
def test_text_to_word_sequence(self):
    text = 'hello! ? world!'
    seq = preprocessing_text.text_to_word_sequence(text)
    self.assertEqual(seq, ['hello', 'world'])
try:
    stopword = stopwords.words('english')
except:
    nltk.download('stopwords')
    nltk.download('punkt')
    stopword = stopwords.words('english')

# Keep negation-related stopwords (no/nor/not/only/too and "n't" contractions).
x = ' '.join(stopword)
nt = re.findall(r"\w+'t", x)
stopword = set(stopword) - ({'no', 'nor', 'not', 'only', 'too'} | set(nt))

# preprocessing of description
data['clean_docs'] = data['Description'].str.replace("[^a-zA-Z']", ' ', regex=True)
data['clean_docs'] = data['clean_docs'].str.replace("'[a-su-zA-SU-Z]", ' ', regex=True)
data['clean_docs'] = data['clean_docs'].str.replace(' +', ' ', regex=True)
data['clean_docs'] = data['clean_docs'].str.lower()

from tensorflow.python.keras.preprocessing.text import text_to_word_sequence

data['clean_docs'] = data['clean_docs'].apply(
    lambda x: [t for t in text_to_word_sequence(x)
               if t not in stopword and len(t) > 1])

# remove rows whose cleaned description is empty
data = data[data['clean_docs'].map(len) > 0]

# concatenating all clean docs
data['clean_docs'] = data['clean_docs'].apply('#'.join)
data.to_csv('../src/database/final_perfume_data_clean_with_clean_docs.csv',
            index=False)
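# Illustrative walk-through of the cleaning above on a single made-up description
# (the sample text is an assumption, not taken from the dataset):
#   "A warm, woody scent that isn't too sweet!"
#   -> regex cleanup keeps letters and apostrophes: "a warm woody scent that isn't too sweet"
#   -> text_to_word_sequence + stopword filter (negations like "isn't" and "too" kept):
#      ['warm', 'woody', 'scent', "isn't", 'too', 'sweet']
#   -> '#'.join(...): "warm#woody#scent#isn't#too#sweet"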
def user_input_preprocessing(user_input):
    res = re.sub("[^a-zA-Z']", ' ', user_input)
    res = re.sub("'[a-su-zA-SU-Z]", ' ', res)
    res = text_to_word_sequence(res)
    return res
def clean_sentence(self, sentence):
    return text_to_word_sequence(
        sentence,
        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
        lower=True,
        split=' ')
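# Quick usage sketch (hypothetical input; assumes this method lives on a class
# instance named `cleaner` here). Apostrophes survive because they are not in the
# filters string:
# cleaner.clean_sentence("Hello, World! It's 2020.") -> ['hello', 'world', "it's", '2020']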