def text_to_word_sequence(input_text,
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                          lower=True,
                          split=' '):
    """Converts a text to a sequence of words (or tokens).

    This function transforms a string of text into a list of words
    while ignoring `filters`, which include punctuation by default.

    >>> sample_text = 'This is a sample sentence.'
    >>> tf.keras.preprocessing.text.text_to_word_sequence(sample_text)
    ['this', 'is', 'a', 'sample', 'sentence']

    Arguments:
        input_text: Input text (string).
        filters: list (or concatenation) of characters to filter out, such as
            punctuation. Default: `'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\\t\\n'`,
            includes basic punctuation, tabs, and newlines.
        lower: boolean. Whether to convert the input to lowercase.
        split: str. Separator for word splitting.

    Returns:
        A list of words (or tokens).
    """
    return text.text_to_word_sequence(
        input_text, filters=filters, lower=lower, split=split)

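# Hedged usage sketch of the wrapper above (assumes keras_preprocessing is installed;
# the expected outputs follow the default filter/split behaviour described in the docstring).
from keras_preprocessing.text import text_to_word_sequence as _ttws

print(_ttws("Don't stop-me now!"))
# ["don't", 'stop', 'me', 'now'] -- the apostrophe survives because it is not in the filters
print(_ttws('hello!stop?world!', split='stop'))
# ['hello', 'world'] -- filtered characters are replaced by the split string before splitting
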
def texts_to_sequences_generator(self, texts):
    """Transforms each text in `texts` into a sequence of integers.

    Each item in `texts` can also be a list, in which case we assume each item
    of that list to be a token.

    Only the top `num_words` most frequent words will be taken into account.
    Only words known by the tokenizer will be taken into account.

    # Arguments
        texts: A list of texts (strings).

    # Yields
        Yields individual sequences.
    """
    num_words = self.num_words
    for text in texts:
        if self.char_level or isinstance(text, list):
            seq = text
        else:
            seq = text_to_word_sequence(text,
                                        self.filters,
                                        self.lower,
                                        self.split)
        vect = []
        for w in seq:
            vect.append(self.resolve_word_or_oov(w))
        yield vect

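# The method above delegates to `resolve_word_or_oov`, which is not shown here.
# A plausible sketch of such a helper (hypothetical, not the project's actual code),
# assuming an `oov_token` has been configured on the tokenizer:
def resolve_word_or_oov(self, w):
    """Map a word to its index; fall back to the OOV index when the word is unknown
    or its rank falls outside `num_words`."""
    i = self.word_index.get(w)
    if i is not None and (not self.num_words or i < self.num_words):
        return i
    return self.word_index.get(self.oov_token)
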
def fit_on_texts(self, texts):
    """Updates internal vocabulary based on a list of texts.

    In the case where texts contains lists, we assume each entry of the lists
    to be a token.

    Required before using `texts_to_sequences` or `texts_to_matrix`.

    # Arguments
        texts: can be a list of strings,
            a generator of strings (for memory-efficiency),
            or a list of list of strings.
    """
    for text in texts:
        self.document_count += 1
        if self.char_level or isinstance(text, list):
            if self.lower:
                if isinstance(text, list):
                    text = [text_elem.lower() for text_elem in text]
                else:
                    text = text.lower()
            seq = text
        else:
            seq = text_to_word_sequence(text,
                                        self.filters,
                                        self.lower,
                                        self.split)
        for w in seq:
            if w in self.word_counts:
                self.word_counts[w] += 1
            else:
                self.word_counts[w] = 1
        for w in set(seq):
            # In how many documents each word occurs
            self.word_docs[w] += 1

    wcounts = list(self.word_counts.items())
    wcounts.sort(key=lambda x: x[1], reverse=True)
    sorted_voc = []
    if self.start_token is not None:
        sorted_voc.append(self.start_token)
    if self.end_token is not None:
        sorted_voc.append(self.end_token)
    if self.oov_token is not None:
        sorted_voc.append(self.oov_token)
    sorted_voc.extend(wc[0] for wc in wcounts)

    # note that index 0 is reserved, never assigned to an existing word
    self.word_index = dict(
        list(zip(sorted_voc, list(range(1, len(sorted_voc) + 1)))))

    self.index_word = dict((c, w) for w, c in self.word_index.items())

    for w, c in list(self.word_docs.items()):
        self.index_docs[self.word_index[w]] = c

def texts_to_sequences(self, texts):
    # Build a (num_texts, padding, embedding_dim) array of word vectors,
    # truncating each text to `self.padding` tokens; `self.e` is the embedding model.
    sentence_features = np.zeros(
        (len(texts), self.padding, self.e.get_dimension()))
    for j, text in enumerate(texts):
        words = text_to_word_sequence(text)
        for i, word in enumerate(words):
            if i >= self.padding:
                break
            sentence_features[j, i, :] = self.e.get_word_vector(word)
    return sentence_features

def run(self):
    with open(self.xml, encoding="utf-8") as fd:
        tree = xmltodict.parse(fd.read(), xml_attribs=False, force_list=True)
    document = getFullText(tree)
    doc_id = search(tree, "id")[0]
    text = clean_text(document)
    words = text_to_word_sequence(text)
    filtered_doc = [
        w.lower() for w in words
        if w not in self.stop_words and w != '' and w.isalpha() and len(w) > 1
    ]
    self.corpus[doc_id] = dict(
        (i, j) for (i, j) in nltk.Counter(filtered_doc).items())
    self.corpusWcount[doc_id] = filtered_doc

def get_word2vec_embedding(data):
    sentences_words = list(
        text_to_word_sequence(x, filters=[], lower=False) for x in data)
    model = Word2Vec(sentences_words,
                     size=embedding_length,
                     workers=4,
                     min_count=1,
                     sg=1,
                     hs=1,
                     iter=5)
    print("Number of word vectors: {}".format(len(model.wv.vocab)))
    return model.wv

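# Hypothetical driver for the function above. It assumes gensim 3.x (the `size`,
# `iter`, and `model.wv.vocab` names used above; gensim 4 renamed these to
# `vector_size`, `epochs`, and `key_to_index`) and a module-level `embedding_length`.
embedding_length = 100  # assumed value for the module-level constant referenced above

word_vectors = get_word2vec_embedding(['the quick brown fox', 'the lazy dog'])
print(word_vectors['fox'].shape)  # (embedding_length,)
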
def read_sequence_from_file(self, filename):
    def get_mapping(word):
        if word in self.word_to_id.keys():
            mapping = self.word_to_id[word]
            if self.skip_top <= mapping < self.max_features:
                return mapping
        # Out of vocabulary char
        return 2

    with open(filename) as file:
        word_seq = text_to_word_sequence(file.read())
    index_seq = [get_mapping(word) for word in word_seq]
    index_seq.insert(0, 1)
    return index_seq

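# The index convention above mirrors the Keras IMDB loader: 1 marks the start of a
# sequence, 2 is the out-of-vocabulary index, and 0 is conventionally left free for
# padding. A hedged illustration of padding such sequences (the index values below
# are hypothetical outputs of read_sequence_from_file):
from keras_preprocessing.sequence import pad_sequences

index_seqs = [[1, 5, 9, 2, 14], [1, 7, 2]]
batch = pad_sequences(index_seqs, maxlen=6, padding='post', value=0)
print(batch)
# [[ 1  5  9  2 14  0]
#  [ 1  7  2  0  0  0]]
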
def create_corpus(output_file_name, data):
    len_data = len(data)
    output_corpus = open(output_file_name, "w", encoding="utf-8")
    my_filters = '"#$&()*+/:;<=>?@[\\]^_`{|}~\t\n'
    with click.progressbar(length=len_data,
                           label="CREATE CORPUS: ",
                           fill_char=click.style('=', fg='white')) as bar:
        for i in range(0, len_data):
            tmp = str(data[i]).lower()
            tmp = text.text_to_word_sequence(text=tmp, filters=my_filters)
            tmp = " ".join(map(str, tmp))
            output_corpus.write(tmp + "\n")
            bar.update(1)
    output_corpus.close()

def texts_to_sequences_generator(self, texts):
    """Transforms each text in `texts` to a sequence of integers.

    Each item in texts can also be a list, in which case we assume each item
    of that list to be a token.

    Only the top `num_words` most frequent words will be taken into account.
    Only words known by the tokenizer will be taken into account.

    # Arguments
        texts: A list of texts (strings).

    # Yields
        Yields individual sequences.
    """
    num_words = self.num_words
    oov_token_index = self.word_index.get(self.oov_token)
    end_token_index = self.word_index.get(self.end_token)
    start_token_index = self.word_index.get(self.start_token)
    for text in texts:
        if self.char_level or isinstance(text, list):
            if self.lower:
                if isinstance(text, list):
                    text = [text_elem.lower() for text_elem in text]
                else:
                    text = text.lower()
            seq = text
        else:
            seq = text_to_word_sequence(text,
                                        self.filters,
                                        self.lower,
                                        self.split)
        vect = []
        if self.start_token is not None:
            vect.append(start_token_index)
        for w in seq:
            i = self.word_index.get(w)
            if i is not None:
                if num_words and i >= num_words:
                    if oov_token_index is not None:
                        vect.append(oov_token_index)
                else:
                    vect.append(i)
            elif self.oov_token is not None:
                vect.append(oov_token_index)
        if self.end_token is not None:
            vect.append(end_token_index)
        yield vect

def fit_on_texts(self, texts):
    """Updates internal vocabulary based on a list of texts.

    In the case where texts contains lists, we assume each entry of the lists
    to be a token.

    Required before using `texts_to_sequences` or `texts_to_matrix`.

    # Arguments
        texts: can be a list of strings,
            a generator of strings (for memory-efficiency),
            or a list of list of strings.
    """
    for text in texts:
        self.document_count += 1
        if self.char_level or isinstance(text, list):
            seq = text
        else:
            seq = text_to_word_sequence(text,
                                        self.filters,
                                        self.lower,
                                        self.split)
        for w in seq:
            if w in self.word_counts:
                self.word_counts[w] += 1
            else:
                self.word_counts[w] = 1
        for w in set(seq):
            if w in self.word_docs:
                self.word_docs[w] += 1
            else:
                self.word_docs[w] = 1

    wcounts = list(self.word_counts.items())
    wcounts.sort(key=lambda x: x[1], reverse=True)
    sorted_voc = [wc[0] for wc in wcounts]
    # note that indices 0 and 1 are reserved, never assigned to an existing word
    self.word_index = dict(
        list(zip(sorted_voc, list(range(2, len(sorted_voc) + 2)))))

    # index 1 is reserved for the oov token
    if self.oov_token is not None:
        i = self.word_index.get(self.oov_token)
        if i is None:
            self.word_index[self.oov_token] = 1

    for w, c in list(self.word_docs.items()):
        self.index_docs[self.word_index[w]] = c

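# Minimal usage sketch with the stock keras_preprocessing Tokenizer (the class this
# `fit_on_texts` variant belongs to is not shown; exact word indices depend on the
# library version, but a configured OOV token ends up at index 1 either way).
from keras_preprocessing.text import Tokenizer

tok = Tokenizer(num_words=100, oov_token='<OOV>')
tok.fit_on_texts(['the cat sat on the mat', 'the dog barked'])
print(tok.word_index['<OOV>'])                    # 1
print(tok.texts_to_sequences(['the cat flew']))   # unseen 'flew' maps to the OOV index
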
def prep_1(text):
    text = "The quick brown fox jumped over the lazy dog."
    list_unique_words = list(set(text_to_word_sequence(text)))
    print(f"docs: {list_unique_words[:100]}")
    vocab_size = len(list_unique_words)
    print(f"vocab_size: {vocab_size}")
    oh_encoding = one_hot(text, n=round(vocab_size * 1.3))
    print(f"oh_encoding: {oh_encoding}")
    hashed_doc = hashing_trick(text,
                               n=round(vocab_size * 1.3),
                               hash_function='md5')
    print(f"hashed_doc: {hashed_doc}")
    return oh_encoding

def process(tweet):
    stop_words = get_stop_words()
    base_filters = '\n\t!"#$%&()*+,-–./:;<=>?[\\]^_`{|}~ 0123456789'
    tweet = str(tweet)
    tweet = re.sub(r'^RT[\s]+', '', tweet)  # remove old-style retweet text "RT"
    tweet = re.sub(
        r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?',
        '', tweet)  # remove hyperlinks
    tweet = tweet.replace('\'', '')
    new_list = [
        x for x in text_to_word_sequence(tweet, filters=base_filters, lower=True)
        if not x.startswith("@")
    ]
    final = [w for w in new_list if w not in stop_words]
    return final

def get_query_and_candidate_terms(self, sequence_length=20):
    stop_words = set(stopwords.words('english'))
    query_texts = list()
    candidate_terms = list()
    query_terms_texts = list()
    keyword_terms_texts = list()
    for query in self.query_list:
        query_texts.append(query.query)
        terms = [''] * (sequence_length * 2)
        query_terms = text_to_word_sequence(query.query)
        terms[:sequence_length] = [
            term for term in query_terms if term not in stop_words
        ][:sequence_length]
        terms[sequence_length:] = [
            keyword for _, keyword in query.keywords
        ][:sequence_length]
        candidate_terms.append(terms)
        query_terms_texts.append(' '.join(terms[:sequence_length]))
        keyword_terms_texts.append(' '.join(terms[sequence_length:]))
    query_sequence = self.tokenizer.texts_to_sequences(query_texts)
    query_sequence = pad_sequences(query_sequence,
                                   maxlen=sequence_length * 2)
    query_terms_sequence = self.tokenizer.texts_to_sequences(
        query_terms_texts)
    query_terms_sequence = pad_sequences(query_terms_sequence,
                                         maxlen=sequence_length,
                                         padding='post')
    keyword_terms_sequence = self.tokenizer.texts_to_sequences(
        keyword_terms_texts)
    keyword_terms_sequence = pad_sequences(keyword_terms_sequence,
                                           maxlen=sequence_length)
    terms_sequence = np.hstack(
        [query_terms_sequence, keyword_terms_sequence])
    return self.query_list, query_sequence, terms_sequence, candidate_terms

def s_r(self, t, r, stop_words, size=80, node=None):
    if isinstance(t, dict):
        for (k, v) in t.items():
            for i, j in enumerate(v, 0):
                words = text_to_word_sequence(clean_text(getFullText(j)))
                words = [
                    w.lower() for w in words
                    if w not in stop_words and w != '' and w.isalpha() and len(w) > 1
                ]
                node = node if node is not None else "/"
                if len(words) > size:
                    if k == "#text":
                        r[node] = words
                    else:
                        r["{}/{}[{}]".format(node, k, i + 1)] = words
                self.s_r(j, r, stop_words, size,
                         node="{}/{}[{}]".format(node, k, i + 1))
    return r

def test_text_to_word_sequence_unicode():
    sample_text = u'ali! veli? kırk dokuz elli'
    assert text.text_to_word_sequence(sample_text) == [
        u'ali', u'veli', u'kırk', u'dokuz', u'elli'
    ]

def test_text_to_word_sequence_multichar_split():
    sample_text = 'hello!stop?world!'
    assert text.text_to_word_sequence(sample_text, split='stop') == ['hello', 'world']

def test_text_to_word_sequence():
    sample_text = 'hello! ? world!'
    assert text.text_to_word_sequence(sample_text) == ['hello', 'world']

def default_tokenizer(text: str) -> List[str]:
    """Default function to tokenize text."""
    return text_to_word_sequence(
        text, lower=False, filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')

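# Quick sanity check of the tokenizer above: case is preserved (lower=False) and
# apostrophes survive because ' is absent from the filter string.
print(default_tokenizer("Keras isn't only for images!"))
# ['Keras', "isn't", 'only', 'for', 'images']
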
def embed(x_lstm, y, x_fe, x_test, x_test_lstm, y_test, needs_embed):
    """
    Embeds the text in the training and test set using the wiki-news pre-trained vectors.

    :param x_lstm: training set text
    :param y: training set labels
    :param x_fe: training set features
    :param x_test: test set features
    :param x_test_lstm: test set text
    :param y_test: test set labels
    :param needs_embed: boolean indicating whether to embed the text -> the text should
        be embedded if the project does not include the "embedded_matrix",
        "embedded_text" and "embedded_text_test" files. To embed,
        "wiki-news-300d-1M.vec" should be included in the project.
    """
    max_len = 197
    if needs_embed:
        # for train and validation set
        word2index, embedding_matrix = load_embeddings('wiki-news-300d-1M.vec',
                                                       embedding_dim=300)
        out_matrix = []
        for text in x_lstm['text'].tolist():
            indices = []
            for w in text_to_word_sequence(text):
                indices.append(word2index[re.sub(r'[^\w\s]', '', w)])
            if len(indices) > max_len:
                max_len = len(indices)
            out_matrix.append(indices)
        encoded_texts = out_matrix
        padded_texts = pad_sequences(encoded_texts, maxlen=max_len, padding='post')
        store_data(padded_texts, 'embedded_text')
        store_data(embedding_matrix, 'embedded_matrix')

        # for test set
        word2index, embedding_matrix = load_embeddings('wiki-news-300d-1M.vec',
                                                       embedding_dim=300)
        out_matrix = []
        for text in x_test_lstm['text'].tolist():
            indices = []
            for w in text_to_word_sequence(text):
                # "Scotfree" is present in the data, but it is not in wiki-news and
                # throws an error
                if w == 'scotfree':
                    continue
                indices.append(word2index[re.sub(r'[^\w\s]', '', w)])
            if len(indices) > max_len:
                max_len = len(indices)
            out_matrix.append(indices)
        encoded_texts = out_matrix
        padded_texts = pad_sequences(encoded_texts, maxlen=max_len, padding='post')
        store_data(padded_texts, 'embedded_text_test')

    embedding_matrix = load_data('embedded_matrix')
    padded_texts = load_data('embedded_text')
    embedding_matrix_test = load_data('embedded_text_test')
    for idx, el in enumerate(padded_texts):
        dataframes[0]['text'][idx] = el
    for idx, el in enumerate(embedding_matrix_test):
        dataframes[3]['text'][idx] = el
    x_test_lstm = dataframes[3]['text']
    do_kfold_validation(x_fe, y, embedding_matrix, max_len, x_test, x_test_lstm,
                        y_test)

                             sep=',', header=0, encoding='latin-1')
else:
    raise ValueError(
        "Input file is missing! Invalid action, or an existing \"existingModelPath\" is needed!"
    )

print(input_data['target'].value_counts())

# Tokenize tweets
tweets = input_data['text'].tolist()
train_sequences = []
for tweet in tweets:
    train_sequences.append(text_to_word_sequence(str(tweet)))

if str(args.embedding).lower() == 'bert':
    print('Using BERT embedding')
    if args.action == 'new':
        print('Training new Model')
        train_x, test_x, train_y, test_y = train_test_split(
            train_sequences, input_data['target'], test_size=0.1, random_state=1234)
        train_x, valid_x, train_y, valid_y = train_test_split(
            train_x, train_y, test_size=0.1, random_state=1234)

def test_text_to_word_sequence_unicode_multichar_split():
    sample_text = u'ali!stopveli?stopkırkstopdokuzstopelli'
    assert text.text_to_word_sequence(sample_text, split='stop') == [
        u'ali', u'veli', u'kırk', u'dokuz', u'elli'
    ]

def keras_tokenize(text):
    text = clean_text(text)
    tokens = text_to_word_sequence(text)
    return tokens

from keras_preprocessing.text import text_to_word_sequence
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.text import one_hot

text = "Hei, dette er noe testtext"  # Norwegian: "Hi, this is some test text"

tronder_file = open("TextInput/rawText.txt", "r", encoding="utf-8")
tronder_text = tronder_file.read()
tronder_file.close()

one_hot_result = one_hot(tronder_text, len(tronder_text))
ttws_result = text_to_word_sequence(tronder_text)

print(ttws_result)
print(one_hot_result)
print(len(ttws_result))
print(len(one_hot_result))