def test_tokenizer_oov_flag(self):
    x_train = ['This text has only known words']
    x_test = ['This text has some unknown words']  # 2 OOVs: some, unknown

    # Default, without OOV flag
    tokenizer = preprocessing_text.Tokenizer()
    tokenizer.fit_on_texts(x_train)
    x_test_seq = tokenizer.texts_to_sequences(x_test)
    self.assertEqual(len(x_test_seq[0]), 4)  # discards 2 OOVs

    # With OOV feature
    tokenizer = preprocessing_text.Tokenizer(oov_token='<unk>')
    tokenizer.fit_on_texts(x_train)
    x_test_seq = tokenizer.texts_to_sequences(x_test)
    self.assertEqual(len(x_test_seq[0]), 6)  # OOVs marked in place
def sequence_vectorize(train_texts, val_texts):
    """Vectorizes texts as sequence vectors.

    1 text = 1 sequence vector with fixed length.

    # Returns
        x_train, x_val, word_index: vectorized training and validation
            texts and word index dictionary
    """
    tokenizer = text.Tokenizer(num_words=TOP_K)
    tokenizer.fit_on_texts(train_texts)

    x_train = tokenizer.texts_to_sequences(train_texts)
    x_val = tokenizer.texts_to_sequences(val_texts)

    max_length = len(max(x_train, key=len))
    if max_length > MAX_SEQUENCE_LENGTH:
        max_length = MAX_SEQUENCE_LENGTH

    # Sequences shorter than max_length are padded at the beginning;
    # longer sequences are truncated.
    x_train = sequence.pad_sequences(x_train, maxlen=max_length)
    x_val = sequence.pad_sequences(x_val, maxlen=max_length)
    return x_train, x_val, tokenizer.word_index
def sequence_vectorize(train_texts, val_texts):
    """Vectorizes texts as sequence vectors.

    1 text = 1 sequence vector with fixed length.

    # Arguments
        train_texts: list, training text strings.
        val_texts: list, validation text strings.

    # Returns
        x_train, x_val, word_index: vectorized training and validation
            texts and word index dictionary.
    """
    # Create vocabulary with training texts.
    tokenizer = text.Tokenizer(num_words=TOP_K)
    tokenizer.fit_on_texts(train_texts)

    # Vectorize training and validation texts.
    x_train = tokenizer.texts_to_sequences(train_texts)
    x_val = tokenizer.texts_to_sequences(val_texts)

    # Get max sequence length.
    max_length = len(max(x_train, key=len))
    if max_length > MAX_SEQUENCE_LENGTH:
        max_length = MAX_SEQUENCE_LENGTH

    # Fix sequence length to max value. Sequences shorter than the length are
    # padded in the beginning and sequences longer are truncated
    # at the beginning.
    x_train = sequence.pad_sequences(x_train, maxlen=max_length)
    x_val = sequence.pad_sequences(x_val, maxlen=max_length)
    return x_train, x_val, tokenizer.word_index
def sequentialize_data(train_contents, val_contents=None):
    """Vectorize data into sequence vectors.

    Args:
        train_contents: training instances
        val_contents: validation instances

    Returns:
        padded sequence vectors of train (and optionally validation) text inputs,
        plus the word index, number of features, tokenizer, and max length.
    """
    tokenizer = text.Tokenizer(num_words=MAX_VOCAB_SIZE)
    tokenizer.fit_on_texts(train_contents)
    x_train = tokenizer.texts_to_sequences(train_contents)
    if val_contents:
        x_val = tokenizer.texts_to_sequences(val_contents)

    max_length = len(max(x_train, key=len))
    if max_length > MAX_SEQ_LENGTH:
        max_length = MAX_SEQ_LENGTH

    x_train = sequence.pad_sequences(x_train, maxlen=max_length)
    if val_contents:
        x_val = sequence.pad_sequences(x_val, maxlen=max_length)

    word_index = tokenizer.word_index
    num_features = min(len(word_index) + 1, MAX_VOCAB_SIZE)
    if val_contents:
        return x_train, x_val, word_index, num_features, tokenizer, max_length
    else:
        return x_train, word_index, num_features, tokenizer, max_length
def tokenizer_from_json(json_string):
    """Parses a JSON tokenizer configuration file and returns a
    tokenizer instance.

    # Arguments
        json_string: JSON string encoding a tokenizer configuration.

    # Returns
        A Keras Tokenizer instance
    """
    tokenizer_config = json.loads(json_string)
    config = tokenizer_config.get('config')

    word_counts = json.loads(config.pop('word_counts'))
    word_docs = json.loads(config.pop('word_docs'))
    index_docs = json.loads(config.pop('index_docs'))
    # Integer indexing gets converted to strings with json.dumps()
    index_docs = {int(k): v for k, v in index_docs.items()}
    index_word = json.loads(config.pop('index_word'))
    index_word = {int(k): v for k, v in index_word.items()}
    word_index = json.loads(config.pop('word_index'))

    tokenizer = text.Tokenizer(**config)
    tokenizer.word_counts = word_counts
    tokenizer.word_docs = word_docs
    tokenizer.index_docs = index_docs
    tokenizer.word_index = word_index
    tokenizer.index_word = index_word

    return tokenizer
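# Hedged usage sketch (not from the original sources): round-trips a fitted
# Tokenizer through its JSON form and back via tokenizer_from_json() above.
# Assumes `text` is keras.preprocessing.text and `json` is imported, as in the
# surrounding snippets; the sample sentences are made up for illustration.
def _demo_tokenizer_json_roundtrip():
    fitted = text.Tokenizer(num_words=100, oov_token='<unk>')
    fitted.fit_on_texts(['the cat sat on the mat', 'the dog sat on the log'])
    restored = tokenizer_from_json(fitted.to_json())
    # The restored tokenizer reproduces the original vocabulary and sequences.
    assert restored.word_index == fitted.word_index
    assert (restored.texts_to_sequences(['the cat and the dog']) ==
            fitted.texts_to_sequences(['the cat and the dog']))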
def test_tokenizer_unicode(self):
    texts = [
        u'ali veli kırk dokuz elli',
        u'ali veli kırk dokuz elli veli kırk dokuz'
    ]
    tokenizer = preprocessing_text.Tokenizer(num_words=5)
    tokenizer.fit_on_texts(texts)
    self.assertEqual(len(tokenizer.word_counts), 5)
def train_and_evaluate(output_dir, hparams):
    # Ensure filewriter cache is clear for TensorBoard events file.
    tf.summary.FileWriterCache.clear()

    # Load data.
    ((train_texts, train_labels),
     (test_texts, test_labels)) = load_hacker_news_data(
         hparams['train_data_path'], hparams['eval_data_path'])

    # Create vocabulary from training corpus.
    tokenizer = text.Tokenizer(num_words=TOP_K)
    tokenizer.fit_on_texts(train_texts)

    # Generate vocabulary file from tokenizer object to enable
    # creating a native tensorflow lookup table later (see vectorize_sentences()).
    tf.gfile.MkDir(output_dir)  # directory must exist before we can use tf.gfile.Open
    global VOCAB_FILE_PATH
    VOCAB_FILE_PATH = os.path.join(output_dir, 'vocab.txt')
    with tf.gfile.Open(VOCAB_FILE_PATH, 'wb') as f:
        f.write("{},0\n".format(PADWORD))  # map padword to 0
        for word, index in tokenizer.word_index.items():
            if index < TOP_K:  # only save mappings for TOP_K words
                f.write("{},{}\n".format(word, index))

    # Create estimator.
    run_config = tf.estimator.RunConfig(save_checkpoints_steps=500)
    estimator = keras_estimator(model_dir=output_dir,
                                config=run_config,
                                learning_rate=hparams['learning_rate'],
                                embedding_path=hparams['embedding_path'],
                                word_index=tokenizer.word_index)

    # Create TrainSpec.
    train_steps = hparams['num_epochs'] * len(train_texts) / hparams['batch_size']
    train_spec = tf.estimator.TrainSpec(
        input_fn=lambda: input_fn(train_texts,
                                  train_labels,
                                  hparams['batch_size'],
                                  mode=tf.estimator.ModeKeys.TRAIN),
        max_steps=train_steps)

    # Create EvalSpec.
    exporter = tf.estimator.LatestExporter('exporter', serving_input_fn)
    eval_spec = tf.estimator.EvalSpec(
        input_fn=lambda: input_fn(test_texts,
                                  test_labels,
                                  hparams['batch_size'],
                                  mode=tf.estimator.ModeKeys.EVAL),
        steps=None,
        exporters=exporter,
        start_delay_secs=10,
        throttle_secs=10)

    # Start training.
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
def seq2int(data: 'tuple from getData()', mod: 'concat name of model data'):
    """Assigns string data to integer indices.

    Takes the tuple from getData() and returns the transformed tuple and word_index.

    input:  getData() ==> (train_data, y_train): 80%, (test_data, y_test): 20%
    output: int(train_data, y_train), int(test_data, y_test), word_index
    """
    (x_train, y_train), (x_test, y_test) = data
    try:
        # Check whether we already have the data, since there is no reason to build it more than once.
        x_train = joblib.load(
            os.path.join(pre_data_dir, mod, '{0}_x_train_sequence.pkl'.format(mod)))
        x_test = joblib.load(
            os.path.join(pre_data_dir, mod, '{0}_x_test_sequence.pkl'.format(mod)))
        tokenizer = joblib.load(
            os.path.join(pre_data_dir, mod, '{0}_word_index.pkl'.format(mod)))
        return (x_train, y_train), (x_test, y_test), tokenizer[0]
    except Exception:
        # Maximum number of distinct words.
        TOP_K = 20000
        # Maximum length for the array of words.
        MAX_SEQUENCE_LENGTH = 500
        # Instantiate the transformer.
        tokenizer = text.Tokenizer(num_words=TOP_K)
        # Fit on the train set.
        tokenizer.fit_on_texts(x_train)
        # Transform the train/test sets.
        x_train = tokenizer.texts_to_sequences(x_train)
        x_test = tokenizer.texts_to_sequences(x_test)
        # Check whether the longest example exceeds MAX_SEQUENCE_LENGTH;
        # since we work with tweets, MAX_SEQUENCE_LENGTH is always greater.
        max_length = len(max(x_train, key=len))
        if max_length > MAX_SEQUENCE_LENGTH:
            max_length = MAX_SEQUENCE_LENGTH
        # Pad sequences so the data set always has the same length.
        x_train = sequence.pad_sequences(x_train, maxlen=max_length)
        x_test = sequence.pad_sequences(x_test, maxlen=max_length)
        # Save the final data.
        joblib.dump(
            x_train,
            os.path.join(pre_data_dir, mod, '{0}_x_train_sequence.pkl'.format(mod)))
        joblib.dump(
            x_test,
            os.path.join(pre_data_dir, mod, '{0}_x_test_sequence.pkl'.format(mod)))
        joblib.dump([tokenizer.word_index, max_length],
                    os.path.join(pre_data_dir, mod, '{0}_word_index.pkl'.format(mod)))
        return (x_train, y_train), (x_test, y_test), tokenizer.word_index
def tokenized_seq_vectors(X_train, X_test):
    tokenizer = text.Tokenizer(num_words=1000)
    tokenizer.fit_on_texts(X_train)

    X_train = tokenizer.texts_to_sequences(X_train)
    X_test = tokenizer.texts_to_sequences(X_test)

    maxlength = len(max(X_train, key=len))
    X_train = sequence.pad_sequences(X_train, maxlen=maxlength)
    X_test = sequence.pad_sequences(X_test, maxlen=maxlength)

    index = tokenizer.word_index
    return X_train, X_test, index
def sequentialize_data(train_contents):
    MAX_VOCAB_SIZE = 200000
    tokenizer = text.Tokenizer(num_words=MAX_VOCAB_SIZE)
    tokenizer.fit_on_texts(train_contents)
    x_train = tokenizer.texts_to_sequences(train_contents)

    max_length = len(max(x_train, key=len))
    x_train = sequence.pad_sequences(x_train, maxlen=max_length)

    word_index = tokenizer.word_index
    num_features = min(len(word_index) + 1, MAX_VOCAB_SIZE)
    return x_train, word_index, num_features, tokenizer, max_length
def train_and_evaluate(output_dir, hparams):
    # Ensure filewriter cache is clear for TensorBoard events file.
    tf.summary.FileWriterCache.clear()

    # Load data.
    ((train_texts, train_labels), (test_texts, test_labels)) = load_hacker_news_data(
        hparams['train_data_path'], hparams['eval_data_path'])

    # Create vocabulary from training corpus.
    tokenizer = text.Tokenizer(num_words=TOP_K)
    tokenizer.fit_on_texts(train_texts)

    # Save token dictionary to use during prediction time.
    pickle.dump(tokenizer, open('tokenizer.pickled', 'wb'))

    # Create estimator.
    run_config = tf.estimator.RunConfig(save_checkpoints_steps=500)
    # TODO: create estimator
    estimator = keras_estimator(
        model_dir=output_dir,
        config=run_config,
        learning_rate=hparams['learning_rate'],
        embedding_path=hparams['embedding_path'],
        word_index=tokenizer.word_index
    )

    # Create TrainSpec.
    train_steps = hparams['num_epochs'] * len(train_texts) / hparams['batch_size']
    train_spec = tf.estimator.TrainSpec(
        input_fn=input_fn(
            train_texts,
            train_labels,
            tokenizer,
            hparams['batch_size'],
            mode=tf.estimator.ModeKeys.TRAIN),
        max_steps=train_steps
    )

    # Create EvalSpec.
    exporter = tf.estimator.LatestExporter('exporter', serving_input_fn)
    eval_spec = tf.estimator.EvalSpec(
        input_fn=input_fn(
            test_texts,
            test_labels,
            tokenizer,
            hparams['batch_size'],
            mode=tf.estimator.ModeKeys.EVAL),
        steps=None,
        exporters=exporter,
        start_delay_secs=10,
        throttle_secs=10
    )

    # Start training.
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
def train_and_evaluate(output_dir, hparams):
    # Ensure filewriter cache is clear for TensorBoard events file.
    tf.compat.v1.summary.FileWriterCache.clear()

    (train_texts, train_labels), (test_texts, test_labels) = load_train_eval(
        hparams['train_data_path'], hparams['eval_data_path'])

    tokenizer = text.Tokenizer()
    tokenizer.fit_on_texts(train_texts)

    # Directory must exist before we can use tf.io.gfile.
    tf.io.gfile.mkdir(output_dir)
    global VOCAB_FILE_PATH
    VOCAB_FILE_PATH = os.path.join(output_dir, 'vocab.txt')
    with tf.io.gfile.GFile(VOCAB_FILE_PATH, 'wb') as f:
        f.write("{},0\n".format(PADWORD))  # map padword to 0
        for word, index in tokenizer.word_index.items():
            # Save mappings for the full vocabulary (no num_words limit on this Tokenizer).
            f.write("{},{}\n".format(word, index))

    runconfig = tf.estimator.RunConfig(save_checkpoints_steps=500)
    estimator = keras_estimator(model_dir=output_dir,
                                config=runconfig,
                                learning_rate=hparams['learning_rate'],
                                embedding_path=hparams['embedding_path'],
                                word_index=tokenizer.word_index,
                                embedding_dim=hparams['embedding_dim'])

    # Create TrainSpec.
    train_steps = hparams['num_epochs'] * len(train_texts) / hparams['batch_size']
    train_spec = tf.estimator.TrainSpec(
        input_fn=lambda: input_fn(train_texts,
                                  train_labels,
                                  hparams['batch_size'],
                                  mode=tf.estimator.ModeKeys.TRAIN),
        max_steps=train_steps)

    # Create EvalSpec.
    exporter = tf.estimator.LatestExporter('exporter', serving_input_fn)
    eval_spec = tf.estimator.EvalSpec(
        input_fn=lambda: input_fn(test_texts,
                                  test_labels,
                                  hparams['batch_size'],
                                  mode=tf.estimator.ModeKeys.EVAL),
        steps=None,
        exporters=exporter,
        start_delay_secs=10,
        throttle_secs=10)

    # Start training.
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
def sequence_vectorizer(X_train, X_val):
    tokenizer = text.Tokenizer(num_words=maximum_features)
    tokenizer.fit_on_texts(X_train)

    X_train = tokenizer.texts_to_sequences(X_train)
    X_val = tokenizer.texts_to_sequences(X_val)

    maximum_length = len(max(X_train, key=len))
    if maximum_length > max_sequence_length:
        maximum_length = max_sequence_length
    print(maximum_length)

    X_train = sequence.pad_sequences(X_train, maxlen=maximum_length)
    X_val = sequence.pad_sequences(X_val, maxlen=maximum_length)
    return X_train, X_val, tokenizer.word_index, tokenizer
def test_sequential_fit(self):
    texts = [
        'The cat sat on the mat.',
        'The dog sat on the log.',
        'Dogs and cats living together.'
    ]
    word_sequences = [['The', 'cat', 'is', 'sitting'],
                      ['The', 'dog', 'is', 'standing']]

    tokenizer = preprocessing_text.Tokenizer()
    tokenizer.fit_on_texts(texts)
    tokenizer.fit_on_texts(word_sequences)
    self.assertEqual(tokenizer.document_count, 5)

    tokenizer.texts_to_matrix(texts)
    tokenizer.texts_to_matrix(word_sequences)
def __init__(self, num_words=2000, maxlen=200, padding='post', truncating='post'):
    # Maximum number of words
    self.num_words = num_words
    # Tokenizer
    self.tokenizer = text.Tokenizer(
        num_words=self.num_words,
        lower=True,
        split=' '
    )
    # Maximum length of the vector
    self.maxlen = maxlen
    # Zero padding
    self.padding = padding
    # Sentence truncation
    self.truncating = truncating
def vectorize_data(training_text, validation_text, test_text):
    glyphs = " abcdefghijklmnopqrstuvwxyz"
    #trn = [' '.join([j for j in i]) for i in training_text]
    #val = [' '.join([j for j in i]) for i in validation_text]
    tokenizer = text.Tokenizer(lower=True, char_level=True, oov_token='@')
    tokenizer.fit_on_texts(training_text + validation_text + test_text)

    train = tokenizer.texts_to_sequences(training_text)
    validate = tokenizer.texts_to_sequences(validation_text)
    testing = tokenizer.texts_to_sequences(test_text)
    glyph_dictionary = tokenizer.word_index

    train = sequence.pad_sequences(train, maxlen=MAX_WORD_LENGTH, padding='post')
    validate = sequence.pad_sequences(validate, maxlen=MAX_WORD_LENGTH, padding='post')
    testing = sequence.pad_sequences(testing, maxlen=MAX_WORD_LENGTH, padding='post')

    return train, validate, testing, glyph_dictionary, tokenizer
def tokenize_vectorize(trainTexts, testTexts):
    # Tokenization and vectorization for sequence models. This method assumes that the
    # order of words in the text is important, so it is better suited for CNNs and RNNs.

    # Create vocabulary with training texts
    tokenizer = text.Tokenizer(num_words=TOP_K, lower=False)
    tokenizer.fit_on_texts(trainTexts.text)
    word_index_text = tokenizer.word_index

    # Create vocabulary with training title
    tokenizer.fit_on_texts(trainTexts.title)
    word_index_title = tokenizer.word_index

    # Vectorize the training and validation texts
    trainSetText = tokenizer.texts_to_sequences(trainTexts.text)
    testSetText = tokenizer.texts_to_sequences(testTexts.text)
    trainSetTitle = tokenizer.texts_to_sequences(trainTexts.title)
    testSetTitle = tokenizer.texts_to_sequences(testTexts.title)

    # Get max sequence length
    max_length = len(max(trainSetText, key=len))
    if max_length > MAX_SEQUENCE_LENGTH:
        max_length = MAX_SEQUENCE_LENGTH

    # Fix sequence length to max value.
    # The sequence is padded in the beginning if shorter than the length,
    # and longer sequences are truncated.
    trainSetText = sequence.pad_sequences(trainSetText, maxlen=max_length)
    trainSetTitle = sequence.pad_sequences(trainSetTitle, maxlen=max_length)
    testSetText = sequence.pad_sequences(testSetText, maxlen=max_length)
    testSetTitle = sequence.pad_sequences(testSetTitle, maxlen=max_length)

    trainSetText = numpy.array(trainSetText)
    trainSetTitle = numpy.array(trainSetTitle)
    testSetText = numpy.array(testSetText)
    testSetTitle = numpy.array(testSetTitle)

    # Commented out to fit with other changes to the classifier model
    #trainSetText = trainSetText.reshape((trainSetText.shape[0], trainSetText.shape[1], 1))
    #testSetText = testSetText.reshape((testSetText.shape[0], testSetText.shape[1], 1))
    #trainSetTitle = trainSetTitle.reshape((trainSetTitle.shape[0], trainSetTitle.shape[1], 1))
    #testSetTitle = testSetTitle.reshape((testSetTitle.shape[0], testSetTitle.shape[1], 1))
    # Shape should be 35918, 500, 1

    X_train = [trainSetText, trainSetTitle]
    X_test = [testSetText, testSetTitle]

    # Labels: converting labels to binary vectors
    Y_train = to_categorical(trainTexts.label, num_classes=2)
    Y_test = to_categorical(testTexts.label, num_classes=2)

    return X_train, X_test, Y_train, Y_test, word_index_text, word_index_title
def train_and_evaluate(output_dir, hparams):
    # Load data.
    ((train_texts, train_labels), (test_texts, test_labels)) = load_hacker_news_data(
        hparams['train_data_path'], hparams['eval_data_path'])

    # Create vocabulary from training corpus.
    tokenizer = text.Tokenizer(num_words=TOP_K)
    tokenizer.fit_on_texts(train_texts)

    # Save token dictionary to use during prediction time.
    pickle.dump(tokenizer, open('tokenizer.pickled', 'wb'))

    # Create estimator.
    run_config = tf.estimator.RunConfig(save_checkpoints_steps=1000)
    estimator = None  # TODO: create estimator

    # Create TrainSpec.
    train_steps = hparams['num_epochs'] * len(train_texts) / hparams['batch_size']
    train_spec = tf.estimator.TrainSpec(
        input_fn=input_fn(
            train_texts,
            train_labels,
            tokenizer,
            hparams['batch_size'],
            mode=tf.estimator.ModeKeys.TRAIN),
        max_steps=train_steps
    )

    # Create EvalSpec.
    exporter = tf.estimator.LatestExporter('exporter', serving_input_fn)
    eval_spec = tf.estimator.EvalSpec(
        input_fn=input_fn(
            test_texts,
            test_labels,
            tokenizer,
            hparams['batch_size'],
            mode=tf.estimator.ModeKeys.EVAL),
        steps=None,
        exporters=exporter,
        start_delay_secs=10,
        throttle_secs=10
    )

    # Start training.
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
def train_and_predict(X_train, X_test, Y_train, Y_test):
    max_words = 1226
    tokenize = text.Tokenizer(num_words=max_words, char_level=False)
    tokenize.fit_on_texts(X_train)
    x_train = tokenize.texts_to_matrix(X_train)
    x_test = tokenize.texts_to_matrix(X_test)

    encoder = LabelEncoder()
    encoder.fit(Y_train)
    y_train = encoder.transform(Y_train)
    y_test = encoder.transform(Y_test)

    num_classes = np.max(y_train) + 1
    y_train = utils.to_categorical(y_train, num_classes)
    y_test = utils.to_categorical(y_test, num_classes)

    batch_size = 32
    epochs = 1000

    # Build the model
    model = Sequential()
    model.add(Dense(512, input_shape=(max_words,)))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    tensorboard = TensorBoard(log_dir="logs/{}".format(time.time()))

    history = model.fit(x_train,
                        y_train,
                        batch_size=batch_size,
                        epochs=epochs,
                        verbose=1,
                        validation_split=0.1,
                        callbacks=[tensorboard])
    score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
    print('Test accuracy:', score[1])
def test_tokenizer(self):
    texts = [
        'The cat sat on the mat.',
        'The dog sat on the log.',
        'Dogs and cats living together.'
    ]
    tokenizer = preprocessing_text.Tokenizer(num_words=10)
    tokenizer.fit_on_texts(texts)

    sequences = []
    for seq in tokenizer.texts_to_sequences_generator(texts):
        sequences.append(seq)
    self.assertLess(np.max(np.max(sequences)), 10)
    self.assertEqual(np.min(np.min(sequences)), 1)

    tokenizer.fit_on_sequences(sequences)

    for mode in ['binary', 'count', 'tfidf', 'freq']:
        matrix = tokenizer.texts_to_matrix(texts, mode)
        self.assertEqual(matrix.shape, (3, 10))
def sequence_vectorize(train_texts, val_texts, k=1):
    """Vectorizes texts as sequence vectors.

    1 text = 1 sequence vector with fixed length.

    # Arguments
        train_texts: list, training text strings.
        val_texts: list, validation text strings.

    # Returns
        x_train, x_val, word_index: vectorized training and validation
            texts and word index dictionary.
    """
    print('Tokenizing')
    # Create vocabulary with training texts.
    tokenizer = text.Tokenizer(num_words=TOP_K)
    tokenizer.fit_on_texts(train_texts)

    print('Vectorizing')
    # Vectorize training and validation texts.
    x_train = tokenizer.texts_to_sequences(train_texts)
    x_val = tokenizer.texts_to_sequences(val_texts)

    # Get max sequence length.
    max_length = len(max(x_train, key=len))
    if max_length > MAX_SEQUENCE_LENGTH:
        max_length = MAX_SEQUENCE_LENGTH

    # Fix sequence length to max value. Sequences shorter than the length are
    # padded in the beginning and sequences longer are truncated
    # at the beginning.
    print('Padding/Truncating Sequences')
    x_train = sequence.pad_sequences(x_train, maxlen=max_length)
    x_val = sequence.pad_sequences(x_val, maxlen=max_length)

    # Save Tokenizer to disk.
    print('Saving Tokenizer')
    tokenConfig = tokenizer.to_json()
    f = open('amazon_sepcnn_' + str(k) + 'k_tokenizer.json', 'w')
    f.write(tokenConfig)
    f.close()

    return x_train, x_val, tokenizer.word_index
def __init__(self,
             vectorizer_mode=TFIDF_MODE,
             max_features=60000,
             verbose=False):
    self.le = LabelEncoder()
    self.stemmer = SnowballStemmer("english")
    self.vectorizer_mode = vectorizer_mode
    self.max_features = max_features
    self.verbose = verbose
    if vectorizer_mode == TFIDF_MODE:
        self.tfidf = TfidfVectorizer(stop_words=stopwords.words('english'),
                                     max_features=self.max_features,
                                     ngram_range=(1, 2),
                                     token_pattern=token_pattern,
                                     tokenizer=custom_tokenizer)
    elif vectorizer_mode == EMBEDDING_MODE:
        self.tokenizer = text.Tokenizer(num_words=self.max_features)
def sequence_vectorize(train_texts, val_texts, number_of_features, max_sequence_length):
    # Create vocabulary with training texts.
    tokenizer = text.Tokenizer(num_words=number_of_features)
    tokenizer.fit_on_texts(train_texts)

    # Vectorize training and validation texts.
    x_train = tokenizer.texts_to_sequences(train_texts)
    x_val = tokenizer.texts_to_sequences(val_texts)

    # Get max sequence length.
    max_length = len(max(x_train, key=len))
    if max_length > max_sequence_length:
        max_length = max_sequence_length

    # Fix sequence length to max value. Sequences shorter than the length are
    # padded in the beginning and sequences longer are truncated
    # at the beginning.
    x_train = sequence.pad_sequences(x_train, maxlen=max_length)
    x_val = sequence.pad_sequences(x_val, maxlen=max_length)
    return x_train, x_val, tokenizer.word_index
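# Hedged usage sketch (not from the original sources): shows the shapes produced by
# the parameterized sequence_vectorize() above. Assumes `text` and `sequence` are
# keras.preprocessing.text / keras.preprocessing.sequence, as elsewhere in this file;
# the toy sentences and limits are made up for illustration.
def _demo_sequence_vectorize():
    train_texts = ['the cat sat on the mat', 'the dog sat on the log']
    val_texts = ['a cat and a dog']
    x_train, x_val, word_index = sequence_vectorize(
        train_texts, val_texts, number_of_features=1000, max_sequence_length=10)
    # Every row is padded/truncated to the same length (here the longest training
    # sequence, 6 tokens, since it is below max_sequence_length).
    assert x_train.shape == (2, 6)
    assert x_val.shape == (1, 6)
    # Out-of-vocabulary validation words are simply dropped (no oov_token was set).
    print(word_index)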
def sequence_vectorize(train, val, test, num_words=10000, max_seq_length=100):
    """Vectorizes texts as sequence vectors.

    1 text = 1 sequence vector with fixed length.

    Args:
        train: list, training speeches
        val: list, validation speeches
        test: list, test speeches

    Kwargs:
        num_words: int, number of words to keep
        max_seq_length: int, make all sequences of this length

    Returns:
        x_train, x_val, x_test, word_index: vectorized training, validation,
            test speeches and word index dictionary.
    """
    # Create vocabulary with training texts.
    tokenizer = text.Tokenizer(num_words=num_words, lower=True, oov_token='<unk>')
    tokenizer.fit_on_texts(train)

    # Vectorize training and validation texts.
    # Transforms each text to a sequence of integers.
    x_train = tokenizer.texts_to_sequences(train)
    x_val = tokenizer.texts_to_sequences(val)
    x_test = tokenizer.texts_to_sequences(test)

    # Fix sequence length to max value. Sequences shorter than the length are
    # padded in the beginning and sequences longer are truncated
    # at the beginning.
    x_train = sequence.pad_sequences(x_train, maxlen=max_seq_length)
    x_val = sequence.pad_sequences(x_val, maxlen=max_seq_length)
    x_test = sequence.pad_sequences(x_test, maxlen=max_seq_length)
    return x_train, x_val, x_test, tokenizer.word_index
def sequence_vectorize(pre, hyp, top_k=20000, max_seq_len=500):
    """Vectorizes texts as sequence vectors."""
    # Create vocabulary with training texts.
    tokenizer = text.Tokenizer(num_words=top_k)
    tokenizer.fit_on_texts(pre + hyp)

    # Vectorize training texts (premises and hypotheses).
    x_pre = tokenizer.texts_to_sequences(pre)
    x_hyp = tokenizer.texts_to_sequences(hyp)

    # Get max sequence length.
    max_length = len(max(x_hyp + x_pre, key=len))
    if max_length > max_seq_len:
        max_length = max_seq_len

    # Fix sequence length to max value. Sequences shorter than the length are
    # padded in the beginning and sequences longer are truncated
    # at the beginning.
    x_hyp = sequence.pad_sequences(x_hyp, maxlen=max_length).astype('float32')
    x_pre = sequence.pad_sequences(x_pre, maxlen=max_length).astype('float32')
    return x_hyp, x_pre, tokenizer
def vectorize_data(train_data, test_data):
    """Builds the indexed word sequences for each property listing.

    :param train_data: dataframe holding the listings of the training set
    :param test_data: dataframe holding the listings of the test set
    :return:
        x_train: list of word (index) sequences for the training texts
        x_test: list of word (index) sequences for the test texts
        tokenizer_obj: Tokenizer instance
    """
    # Limit of the feature vector
    limit_words = 20000

    # Join a property's name and description into a single text
    train_texts = (
        train_data['nombre'].apply(lambda x: x if x is not np.nan else ' ') + ' ' +
        train_data['descripcion'].apply(lambda x: x if x is not np.nan else ' '))
    test_texts = (
        test_data['nombre'].apply(lambda x: x if x is not np.nan else ' ') + ' ' +
        test_data['descripcion'].apply(lambda x: x if x is not np.nan else ' '))

    # Vocabulary
    tokenizer_obj = text.Tokenizer(limit_words)
    tokenizer_obj.fit_on_texts(train_texts.to_list())

    # Vectorize both sets
    x_train = tokenizer_obj.texts_to_sequences(train_texts.to_list())
    x_test = tokenizer_obj.texts_to_sequences(test_texts.to_list())

    # Get the maximum vector length and pad both sets to that value
    max_dimension = max(len(max(x_train, key=len)), len(max(x_test, key=len)))
    x_train = sequence.pad_sequences(x_train, maxlen=max_dimension)
    x_test = sequence.pad_sequences(x_test, maxlen=max_dimension)

    return x_train, x_test, tokenizer_obj
def sequence_vectorize(texts):
    """Vectorizes texts as sequence vectors.

    # Arguments
        texts: list, text strings.

    # Returns
        padded_vectors, word_index: vectorized texts and word index dictionary.
    """
    # Create vocabulary with training texts.
    tokenizer = text.Tokenizer(num_words=TOP_K)
    tokenizer.fit_on_texts(texts)

    # Vectorize text.
    vectors = tokenizer.texts_to_sequences(texts)

    # Get max sequence length.
    max_length = len(max(vectors, key=len))
    if max_length > MAX_SEQUENCE_LENGTH:
        max_length = MAX_SEQUENCE_LENGTH

    # Add padding to sequences.
    padded_vectors = sequence.pad_sequences(vectors, maxlen=max_length)
    return padded_vectors, tokenizer.word_index
def create_tokenizer(input_ds, top_words, preprocess_fn=None, key='text',
                     oov_token=None, tokenizer=None):
    '''Creates a keras.preprocessing.text.Tokenizer object based on the input dataset,
    the top number of words, and an optional NLP preprocessing function.

    args:
        input_ds: list of dicts; each dict has the following key:
            key: str, default 'text', text that needs to be tokenized
        top_words: int, top number of words to be tokenized
        preprocess_fn: function, default None; the text in input_ds is also passed
            through this function to train the tokenizer (the raw text from input_ds
            is used as well, not only the preprocessed version)
        oov_token: str, default None; if not None, token to replace out-of-vocab words
        tokenizer: keras.preprocessing.text.Tokenizer object, default None; existing
            tokenizer object that will get trained on input_ds. If a value is passed,
            the top_words and oov_token args will be ignored.

    returns: keras.preprocessing.text.Tokenizer object
    '''
    word_list = []  # list of texts
    for item in input_ds:
        words = item[key]
        word_list.append(words)
        if preprocess_fn:
            preprocessed_words = preprocess_fn(words)
            word_list.append(preprocessed_words)

    if tokenizer is None:
        tokenizer = text.Tokenizer(num_words=top_words, oov_token=oov_token)
    tokenizer.fit_on_texts(word_list)
    return tokenizer
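# Hedged usage sketch (not from the original sources): builds a tokenizer with
# create_tokenizer() above from a tiny list-of-dicts dataset. The lower-casing
# preprocess function and the sample records are made-up assumptions for illustration.
def _demo_create_tokenizer():
    input_ds = [
        {'text': 'The cat sat on the mat.'},
        {'text': 'The dog sat on the log.'},
    ]
    tok = create_tokenizer(input_ds,
                           top_words=100,
                           preprocess_fn=lambda s: s.lower(),
                           oov_token='<unk>')
    # Both the raw and the preprocessed texts were used to fit the vocabulary.
    print(tok.word_index)
    print(tok.texts_to_sequences(['the cat and the dog']))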
def train_and_evaluate(output_dir, hparams):
    # Ensure filewriter cache is clear for TensorBoard events file.
    tf.summary.FileWriterCache.clear()

    # Load data.
    ((train_texts, train_labels),
     (test_texts, test_labels)) = load_review_data(hparams['train_data_path'],
                                                   hparams['eval_data_path'])

    # Create vocabulary from training corpus.
    tokenizer = text.Tokenizer(num_words=TOP_K)
    tokenizer.fit_on_texts(train_texts)

    # Generate vocabulary file from tokenizer object to enable
    # creating a native tensorflow lookup table later (used in vectorize_sentences()).
    tf.gfile.MkDir(output_dir)  # directory must exist before we can use tf.gfile.Open
    global VOCAB_FILE_PATH
    VOCAB_FILE_PATH = os.path.join(output_dir, 'vocab.txt')
    with tf.gfile.Open(VOCAB_FILE_PATH, 'wb') as f:
        f.write("{},0\n".format(PADWORD))  # map padword to 0
        for word, index in tokenizer.word_index.items():
            if index < TOP_K:  # only save mappings for TOP_K words
                f.write("{},{}\n".format(word, index))

    # Create estimator.
    run_config = tf.estimator.RunConfig(save_checkpoints_steps=100,
                                        save_summary_steps=100)
    if hparams['model_type'] == 'CNN':
        estimator = keras_CNN_estimator(
            model_dir=output_dir,
            config=run_config,
            learning_rate=hparams['learning_rate'],
            filters=hparams['filters'],
            dropout_rate=hparams['dropout_rate'],
            embedding_dim=hparams['embedding_dim'],
            kernel_size=hparams['kernel_size'],
            pool_size=hparams['pool_size'],
            embedding_path=hparams['embedding_path'],
            word_index=tokenizer.word_index)
    elif hparams['model_type'] == 'LSTM':
        estimator = keras_LSTM_estimator(
            model_dir=output_dir,
            config=run_config,
            learning_rate=hparams['learning_rate'],
            dropout_rate=hparams['dropout_rate'],
            embedding_dim=hparams['embedding_dim'],
            embedding_path=hparams['embedding_path'],
            word_index=tokenizer.word_index)
    elif hparams['model_type'] == 'BiDirect':
        estimator = keras_BiDirect_estimator(
            model_dir=output_dir,
            config=run_config,
            learning_rate=hparams['learning_rate'],
            dropout_rate=hparams['dropout_rate'],
            embedding_dim=hparams['embedding_dim'],
            embedding_path=hparams['embedding_path'],
            word_index=tokenizer.word_index)

    ### Add evaluating metric
    #estimator = tf.contrib.estimator.add_metrics(estimator, my_acc)

    ### Add early stopping
    early_stopping = tf.estimator.experimental.stop_if_no_decrease_hook(
        estimator,
        metric_name='loss',
        max_steps_without_decrease=1000,
        min_steps=100)

    # Create TrainSpec.
    train_steps = hparams['num_epochs'] * len(train_texts) / hparams['batch_size']
    train_spec = tf.estimator.TrainSpec(
        input_fn=lambda: input_fn(train_texts,
                                  train_labels,
                                  hparams['batch_size'],
                                  mode=tf.estimator.ModeKeys.TRAIN),
        hooks=[early_stopping],
        max_steps=train_steps)

    # Create EvalSpec.
    exporter = tf.estimator.LatestExporter('exporter', serving_input_fn)
    eval_spec = tf.estimator.EvalSpec(
        input_fn=lambda: input_fn(test_texts,
                                  test_labels,
                                  hparams['batch_size'],
                                  mode=tf.estimator.ModeKeys.EVAL),
        steps=None,
        exporters=exporter,
        start_delay_secs=10,
        throttle_secs=10)

    # Start training.
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
def train(self, texts: List[str], target: List[int]) -> None:
    from tensorflow.python.keras.models import Sequential  # type: ignore
    from tensorflow.python.keras.layers import Embedding, Dense, LSTM, GlobalMaxPool1D  # type: ignore
    from tensorflow.keras.optimizers import Adam  # type: ignore
    from tensorflow.keras.callbacks import History  # type: ignore

    if self.downsampling:
        texts, target = downsample(texts, target, self.downsampling_ratio)

    if self.verbose:
        print('1. Vectorizing texts')

    NUMBER_OF_FEATURES: int = 20000
    self.tokenizer = text.Tokenizer(num_words=NUMBER_OF_FEATURES)
    self.tokenizer.fit_on_texts(texts)
    vocabulary: Dict[str, int] = self.tokenizer.word_index

    if self._max_sequence_length == 0:
        self._max_sequence_length = len(max(texts, key=len))

    vectorized_texts: array = self.vectorize_texts(texts)

    if self.embedding_location == '':
        if self.verbose:
            print('2. Skip (no embeddings)')
            print('3. Skip (no embeddings)')
    else:
        if self.verbose:
            print('2. Loading word embeddings')
        embedding_dictionary: Dict[str, List[float]] = load_embedding_dictionary(
            self.embedding_location)
        # Check how many values we have for the first word
        nr_of_embedding_features: int = len(list(embedding_dictionary.values())[1])

        if self.verbose:
            print('3. Creating embedding matrix')
        embedding_matrix: array = create_embedding_matrix_for_vocabulary(
            embedding_dictionary, vocabulary)

    if self.verbose:
        print('4. Building up model')

    # Define a simple LSTM model with a pretrained embedding layer
    model: Sequential = Sequential()
    if self.embedding_location == '':
        # Add an empty embedding layer if we have no pretrained embeddings
        EMPTY_EMBEDDING_LAYER_SIZE: int = 300
        model.add(Embedding(len(vocabulary) + 1, EMPTY_EMBEDDING_LAYER_SIZE))
    else:
        model.add(Embedding(input_dim=len(vocabulary) + 1,
                            output_dim=nr_of_embedding_features,
                            input_length=vectorized_texts.shape[1],
                            weights=[embedding_matrix],
                            trainable=False))
    model.add(LSTM(16, return_sequences=True))
    model.add(LSTM(16, return_sequences=True))
    model.add(LSTM(16, return_sequences=True))
    model.add(GlobalMaxPool1D())
    model.add(Dense(256))
    model.add(Dense(256))
    model.add(Dense(1, activation='sigmoid'))

    # Compile the model
    optimizer: Adam = Adam(lr=self.learning_rate)
    model.compile(optimizer=optimizer,
                  loss='binary_crossentropy',
                  metrics=['acc'])

    if self.verbose:
        print('5. training the model')
    history: History = model.fit(
        vectorized_texts,
        target,
        epochs=self.learning_epochs,
        #validation_data=(test_vectors, test_target),
        verbose=1,  # Logs once per epoch.
        batch_size=self.learning_batch_size)

    self.model = model