def get_dataset():
    """Build an infinitely repeating, batched tf.data pipeline over the IMDB training split.

    Relies on module-level `imdb`, `sequence`, `tf`, and `max_features`.
    """
    (x_train, y_train), (_, _) = imdb.load_data(num_words=max_features)
    x_train = sequence.pad_sequences(x_train, maxlen=80)
    dataset = (
        tf.data.Dataset.from_tensor_slices((x_train, y_train))
        .repeat()                                                  # loop forever; caller controls steps
        .map(lambda features, label: (features, tf.cast(label, tf.int32)))
        .batch(32, drop_remainder=True)                            # fixed batch shape for downstream graph
    )
    return dataset
def tf2_estimator():
    """Train and evaluate a small IMDB sentiment model with the Orca TF2 (Ray) Estimator.

    Side effects: starts/stops an Orca context, downloads the Keras IMDB
    dataset, and writes a saved model under ``work/``.
    """
    from zoo.orca.learn.tf2.estimator import Estimator  # import ray

    init_orca_context(cluster_mode="local", cores=4, memory="3g")
    print("running tf2 estimator")

    imdb = keras.datasets.imdb
    (train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=1000)

    # Shift all indices by 3 so ids 0-3 can hold the special tokens below.
    word_index = imdb.get_word_index()
    word_index = {k: (v + 3) for k, v in word_index.items()}
    word_index["<PAD>"] = 0
    word_index["<START>"] = 1
    word_index["<UNK>"] = 2  # unknown
    word_index["<UNUSED>"] = 3

    train_data = keras.preprocessing.sequence.pad_sequences(
        train_data, value=word_index["<PAD>"], padding='post', maxlen=256)
    test_data = keras.preprocessing.sequence.pad_sequences(
        test_data, value=word_index["<PAD>"], padding='post', maxlen=256)

    def model_creator(config):
        # BUG FIX: the TF2 (Ray) backend's Estimator.from_keras expects a
        # *creator function* (each remote worker builds its own compiled
        # model), not an already-built model instance as the original passed.
        model = keras.Sequential()
        model.add(keras.layers.Embedding(1000, 16))
        model.add(keras.layers.GlobalAveragePooling1D())
        model.add(keras.layers.Dense(16, activation=tf.nn.relu))
        model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))
        model.summary()
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
        return model

    # Hold out the first 1000 reviews for validation.
    x_val = train_data[:1000]
    partial_x_train = train_data[1000:]
    y_val = train_labels[:1000]
    partial_y_train = train_labels[1000:]
    train_dataset = tf.data.Dataset.from_tensor_slices((partial_x_train, partial_y_train))
    validation_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val))

    est = Estimator.from_keras(model_creator=model_creator)
    est.fit(data=train_dataset, batch_size=512, epochs=100,
            validation_data=validation_dataset)
    results = est.evaluate(validation_dataset)
    print(results)
    est.save('work/saved_model')
    est.get_train_summary(tag='Loss')
    est.get_validation_summary(tag='Top1Accuracy')
    stop_orca_context()
def prepare_data(self):
    """Load the IMDB training split, pad it, and carve off an eval subset.

    :return: (train_generator, eval_generator, train_size, eval_size)
    """
    (x_train, y_train), (_, _) = imdb.load_data(num_words=self.flags.vocab_size)
    word_index = self.build_word_index()
    x_train = pad_sequences(x_train, maxlen=250,
                            value=word_index['<PAD>'], padding='post')
    # First 20000 reviews train; the remainder evaluates.
    x_train, x_eval = x_train[:20000], x_train[20000:]
    y_train, y_eval = y_train[:20000], y_train[20000:]
    train_data = self.build_generator(x_train, y_train)
    eval_data = self.build_generator(x_eval, y_eval)
    return train_data, eval_data, len(x_train), len(x_eval)
def prepare_data(self):
    """Load the IMDB test split, build the word indexes, and return the padded reviews.

    Keeps the *unpadded* sequences on ``self.x_test`` for later decoding.
    :return: padded test sequences
    """
    (_, _), (x_test, y_test) = imdb.load_data(num_words=self.flags.vocab_size)
    # Both the forward and reverse indexes are needed for decoding reviews.
    self.build_word_index()
    self.build_reverse_word_index()
    self.x_test = x_test  # raw, variable-length sequences
    padded = pad_sequences(x_test, maxlen=250,
                           value=self.word_index['<PAD>'], padding='post')
    return padded
def preprocess_data(self, data_dir):
    """Load IMDB with explicit special-token ids, pad both splits, and bundle the result.

    :param data_dir: unused here; kept for interface compatibility.
    :return: a PreProcessedData record with padded arrays, lengths, and vocab info.
    """
    print('IMDB_Task preprocess_data')
    vocab_size = self.configs['vocab_size']
    sentence_size = self.configs['max_time']
    # Reserve the first vocabulary indices for padding, the start-of-sequence
    # token, and out-of-vocabulary words.
    pad_id = 0
    start_id = 1
    oov_id = 2
    index_offset = 2
    print("Loading data...")
    (x_train_variable, y_train), (x_test_variable, y_test) = imdb.load_data(
        num_words=vocab_size,
        start_char=start_id,
        oov_char=oov_id,
        index_from=index_offset)
    print(len(y_train), "train sequences")
    print(len(y_test), "test sequences")
    print("Pad sequences (samples x time)")
    x_train = sequence.pad_sequences(x_train_variable, maxlen=sentence_size,
                                     truncating='post', padding='post',
                                     value=pad_id)
    x_test = sequence.pad_sequences(x_test_variable, maxlen=sentence_size,
                                    truncating='post', padding='post',
                                    value=pad_id)
    print("x_train shape:", x_train.shape)
    print("x_test shape:", x_test.shape)
    # Original (pre-truncation) lengths, capped at sentence_size.
    x_len_train = np.array([min(len(seq), sentence_size) for seq in x_train_variable])
    x_len_test = np.array([min(len(seq), sentence_size) for seq in x_test_variable])
    word_index = imdb.get_word_index()
    return PreProcessedData(x_train=x_train, y_train=y_train,
                            x_len_train=x_len_train, x_test=x_test,
                            y_test=y_test, x_len_test=x_len_test,
                            vocab_size=vocab_size, word_index=word_index)
def import_data():
    """Load the IMDB dataset and return one-hot encoded train/test tensors.

    Labels are converted to float32 arrays for use with a sigmoid output.
    """
    train_split, test_split = imdb.load_data(num_words=10000)
    train_data, train_labels = train_split
    test_data, test_labels = test_split
    x_train = vectorize_sequence(train_data)
    x_test = vectorize_sequence(test_data)
    y_train = np.asarray(train_labels).astype('float32')
    y_test = np.asarray(test_labels).astype('float32')
    return (x_train, y_train), (x_test, y_test)
def __init__(self):
    """Build the padded IMDB splits and hand them to the base-class constructor."""
    num_classes = 1  # binary sentiment, single sigmoid output
    self.max_features = 5000
    self.maxlen = 400
    (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=self.max_features)
    x_train = pad_sequences(x_train, maxlen=self.maxlen)
    x_test = pad_sequences(x_test, maxlen=self.maxlen)
    super().__init__(x_train, x_test, y_train, y_test,
                     (self.maxlen, ), num_classes, 'imdb')
def __init__(self):
    """Load the raw (variable-length) IMDB splits with explicit special-token ids."""
    self.vocab_size = 5000
    self.start_id = 1
    self.oov_id = 2
    self.index_offset = 2
    self.sentence_size = 200
    model_dir = tempfile.mkdtemp()
    print("Loading data...")
    (self.x_train_variable, self.y_train), (self.x_test_variable, self.y_test) = \
        imdb.load_data(num_words=self.vocab_size,
                       start_char=self.start_id,
                       oov_char=self.oov_id,
                       index_from=self.index_offset)
    # Padded arrays are produced later; integer placeholders for now.
    self.x_train = 0
    self.x_test = 0
    print(len(self.y_train), "train sequences")
    print(len(self.y_test), "test sequences")
def test_train_ngram():
    """n-gram training on IMDB should land near 0.91 accuracy and 0.24 loss."""
    train, val = imdb.load_data()
    accuracy, loss = training.train_ngram(train, val)
    assert accuracy == pytest.approx(0.91, 0.02)
    assert loss == pytest.approx(0.24, 0.02)
def test_fine_tuned_sequence():
    """Fine-tuned sequence model should land near 0.84 accuracy and 0.55 loss."""
    train, val = imdb.load_data()
    accuracy, loss = tune.fine_tune_sequence(train, val)
    assert accuracy == pytest.approx(0.84, 0.02)
    assert loss == pytest.approx(0.55, 0.02)
def test_fine_tune_ngram():
    """Fine-tuned n-gram model should land near 0.61 accuracy and 0.89 loss."""
    train, val = imdb.load_data()
    accuracy, loss = tune.fine_tune_ngram(train, val)
    assert accuracy == pytest.approx(0.61, 0.02)
    assert loss == pytest.approx(0.89, 0.02)
# NOTE(review): this chunk begins inside a prediction helper whose `def` line
# is outside this view; the indented fragment below is kept verbatim.
    # Strip punctuation, map each word to its id, pad, and predict sentiment.
    message = message.translate(str.maketrans('', '', string.punctuation))
    tmp = []
    for word in message.split(" "):
        tmp.append(word_to_id[word])
    padded_message = sequence.pad_sequences([tmp], maxlen=PAD_MAX_LENGTH)
    sentiment_prediction = my_model.predict(np.array(padded_message))
    return sentiment_prediction

# Training and test data are loaded via Keras.
# Alternatively you can download the file directly as a pickle file.
(x_train, y_train), (x_test, y_test) = imdb.load_data(path="imdb.npz",
                                                      num_words=VOCABULARY_SIZE,
                                                      skip_top=0,
                                                      maxlen=None,
                                                      seed=113,
                                                      start_char=START_CHAR,
                                                      oov_char=2,
                                                      index_from=INDEX_FROM)
# The file is downloaded as imdb_word_index.json.
word_to_id = imdb.get_word_index(path="./imdb_word_index.json")
# Store the correct indices with the matching word, since there is an index
# shift of +3 (see the explanation in
# https://keras.io/datasets/#imdb-movie-reviews-sentiment-classification)
# From: https://stackoverflow.com/questions/42821330/restore-original-text-from-keras-s-imdb-dataset
word_to_id = {k: (v + INDEX_FROM) for k, v in word_to_id.items()}
word_to_id["<PAD>"] = 0
word_to_id["<START>"] = START_CHAR  # 1
word_to_id["<UNK>"] = 2
id_to_word = {value: key for key, value in word_to_id.items()}
def test_train_sequence():
    """Sequence-model training should land near 0.68 accuracy and 0.82 loss."""
    train, val = imdb.load_data()
    accuracy, loss = training.train_sequence(train, val)
    assert accuracy == pytest.approx(0.68, 0.02)
    assert loss == pytest.approx(0.82, 0.02)
parser.add_argument('--cluster_mode', type=str, default="local",
                    help='The mode for the Spark cluster. local or yarn.')
args = parser.parse_args()
cluster_mode = args.cluster_mode

# Spin up Orca either locally or against a YARN cluster.
if cluster_mode == "local":
    init_orca_context(cluster_mode="local", cores=4, memory="3g")
elif cluster_mode == "yarn":
    init_orca_context(cluster_mode="yarn-client", num_nodes=2, cores=2,
                      driver_memory="3g",
                      conf={"spark.executor.extraJavaOptions": "-Xss512m",
                            "spark.driver.extraJavaOptions": "-Xss512m"})

max_features = 20000
max_len = 200

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=max_len)
x_test = sequence.pad_sequences(x_test, maxlen=max_len)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

# Position indices 0..max_len-1 for every sample (transformer position input);
# broadcasting fills every row with the same arange.
train_pos = np.zeros((len(x_train), max_len), dtype=np.int32)
val_pos = np.zeros((len(x_test), max_len), dtype=np.int32)
train_pos[:] = np.arange(max_len)
val_pos[:] = np.arange(max_len)
tf.logging.set_verbosity(tf.logging.INFO)
print(tf.__version__)

"""### Loading the data

Keras provides a convenient handler for importing the dataset which is also
available as a serialized numpy array `.npz` file to download [here](
https://s3.amazonaws.com/text-datasets/imdb.npz). Each review consists of a
series of word indexes that go from $4$ (the most frequent word in the
dataset, **the**) to $4999$, which corresponds to **orange**. Index $1$
represents the beginning of the sentence and the index $2$ is assigned to all
unknown (also known as *out-of-vocabulary* or *OOV*) tokens. These indexes
have been obtained by pre-processing the text data in a pipeline that cleans,
normalizes and tokenizes each sentence first and then builds a dictionary
indexing each of the tokens by frequency.

We are not convering these techniques in this post, but you can take a look
at [this chapter](http://www.nltk.org/book/ch03.html) of the NLTK book to
learn more.
"""

vocab_size = 5000
sentence_size = 200
embedding_size = 50
model_dir = tempfile.mkdtemp()

# Should we not use keras and rewrite this logic?
print("Loading data...")
(x_train_variable, y_train), (x_test_variable, y_test) = imdb.load_data(
    num_words=vocab_size)
print(len(y_train), "train sequences")
print(len(y_test), "test sequences")

# Pad/truncate every review to a fixed sentence_size with trailing zeros.
print("Pad sequences (samples x time)")
x_train = sequence.pad_sequences(x_train_variable, maxlen=sentence_size,
                                 padding='post', value=0)
x_test = sequence.pad_sequences(x_test_variable, maxlen=sentence_size,
                                padding='post', value=0)
print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)

"""We can use the word index map to inspect how the first review looks like."""
# https://github.com/iamved/IMDB-sentiment-analysis/blob/master/IMDB_Sentiment_Analysis.ipynb
# https://github.com/balag59/imdb-sentiment-bidirectional-LSTM/blob/master/imdb_bilstm_train.py
from tensorflow.python.keras.datasets import imdb
from tensorflow.python.keras.preprocessing import sequence
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, TimeDistributed
import numpy as np

max_features = 10000
max_length = 500

(train_data, train_label), (test_data, test_labels) = imdb.load_data(num_words=max_features)
train_data = sequence.pad_sequences(train_data, maxlen=max_length)
test_data = sequence.pad_sequences(test_data, maxlen=max_length)

# Fold the first 15000 test reviews into the training set; keep the rest for testing.
train_data = np.concatenate((train_data, test_data[:15000]))
test_data = test_data[15000:]
train_label = np.concatenate((train_label, test_labels[:15000]))
test_labels = test_labels[15000:]

# Stacked bidirectional LSTM over learned embeddings.
embedding_size = 128
network = Sequential()
network.add(Embedding(max_features, embedding_size, input_length=max_length))
network.add(Bidirectional(LSTM(embedding_size, return_sequences=True)))
# network.add(Dropout(0.2))
network.add(Bidirectional(LSTM(embedding_size, return_sequences=True)))
# network.add(Dropout(0.2))
def tf_estimator():
    """End-to-end smoke test of the Orca TF1 Estimator keras-model APIs.

    Side effects: starts/stops an Orca context, downloads the Keras IMDB
    dataset, and writes model artifacts under ``work/``.
    """
    from zoo.orca.learn.tf.estimator import Estimator

    init_orca_context(cluster_mode="local", cores=4, memory="3g")
    os.environ["HDF5_USE_FILE_LOCKING"] = 'FALSE'
    print("running tf estimator")

    imdb = keras.datasets.imdb
    (train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=1000)

    # Shift all indices by 3 so ids 0-3 can hold the special tokens below.
    word_index = imdb.get_word_index()
    word_index = {word: (idx + 3) for word, idx in word_index.items()}
    word_index["<PAD>"] = 0
    word_index["<START>"] = 1
    word_index["<UNK>"] = 2  # unknown
    word_index["<UNUSED>"] = 3

    train_data = keras.preprocessing.sequence.pad_sequences(
        train_data, value=word_index["<PAD>"], padding='post', maxlen=256)
    test_data = keras.preprocessing.sequence.pad_sequences(
        test_data, value=word_index["<PAD>"], padding='post', maxlen=256)

    model = keras.Sequential()
    model.add(keras.layers.Embedding(1000, 16))
    model.add(keras.layers.GlobalAveragePooling1D())
    model.add(keras.layers.Dense(16, activation=tf.nn.relu))
    model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))
    model.summary()
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

    # Hold out the first 1000 reviews for validation.
    x_val, partial_x_train = train_data[:1000], train_data[1000:]
    y_val, partial_y_train = train_labels[:1000], train_labels[1000:]
    train_dataset = tf.data.Dataset.from_tensor_slices((partial_x_train, partial_y_train))
    validation_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val))

    est = Estimator.from_keras(keras_model=model)

    # Round 1: constant gradient clipping.
    est.set_constant_gradient_clipping(0.1, 0.2)
    est.fit(data=train_dataset, batch_size=512, epochs=5,
            validation_data=validation_dataset)
    results = est.evaluate(validation_dataset)
    print(results)

    # Round 2: L2-norm gradient clipping.
    est.clear_gradient_clipping()
    est.set_l2_norm_gradient_clipping(0.1)
    est.fit(data=train_dataset, batch_size=512, epochs=5,
            validation_data=validation_dataset)
    results = est.evaluate(validation_dataset)
    print(results)

    est.save('work/saved_model')
    print("save API finished")

    # est.save_tf_checkpoint('work/checkpoint')
    est.load_tf_checkpoint('work/checkpoint')
    print("checkpoint save and load API finished")

    est.save_keras_model('work/keras_model')
    est.save_keras_weights('work/keras_weights')
    print("keras model and weights save API finished")
    # est.load_keras_model('work/keras_model')
    # est.load_keras_weights('work')
    print("keras model and weights load API finished")

    est.get_train_summary(tag='Loss')
    est.get_validation_summary(tag='Top1Accuracy')

    # Estimator.load(est, model_path='work/')  # Has not been implemented
    # resutls = est.predict(validation_dataset)
    # print(results)
    stop_orca_context()
import tensorflow as tf
import numpy
# BUG FIX: `from tensorflow.python.keras.datasets import imdb` was imported twice.
from tensorflow.python.keras.datasets import imdb
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.layers import LSTM
from tensorflow.python.keras.layers import Conv1D
from tensorflow.python.keras.layers import MaxPooling1D
from tensorflow.python.keras.layers import Embedding
from tensorflow.python.keras.preprocessing import sequence

# Fix the random seed for reproducible runs.
numpy.random.seed(7)

top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

# Conv1D front-end extracts local n-gram features; the LSTM models the sequence.
embedding_vector_length = 32  # typo fixed: was `embedding_vecor_length`
model = Sequential()
model.add(Embedding(top_words, embedding_vector_length, input_length=max_review_length))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
tf.logging.set_verbosity(tf.logging.INFO)
print(tf.__version__)

"""### Loading the data

Keras provides a convenient handler for importing the dataset which is also
available as a serialized numpy array `.npz` file to download [here](
https://s3.amazonaws.com/text-datasets/imdb.npz). Each review consists of a
series of word indexes that go from $4$ (the most frequent word in the
dataset, **the**) to $4999$, which corresponds to **orange**. Index $1$
represents the beginning of the sentence and the index $2$ is assigned to all
unknown (also known as *out-of-vocabulary* or *OOV*) tokens. These indexes
have been obtained by pre-processing the text data in a pipeline that cleans,
normalizes and tokenizes each sentence first and then builds a dictionary
indexing each of the tokens by frequency.

We are not convering these techniques in this post, but you can take a look
at [this chapter](http://www.nltk.org/book/ch03.html) of the NLTK book to
learn more.

It's standard to limit the size of the vocabulary to prevent the dataset from
becoming too sparse and high dimensional, causing potential overfitting. After
we've loaded the data in memory we pad each of the sentences with $-1$ to a
fixed size (here: $200$) so that we have two $2$-dimensional
$25000\times200$ arrays for training and testing respectively.
"""

vocab_size = 5000
sentence_size = 200
embedding_size = 50
model_dir = tempfile.mkdtemp()

# Should we not use keras and rewrite this logic?
print("Loading data...")
(x_train_variable, y_train), (x_test_variable, y_test) = imdb.load_data(
    num_words=vocab_size)
print(len(y_train), "train sequences")
print(len(y_test), "test sequences")

# Pad/truncate every review to a fixed sentence_size with trailing zeros.
print("Pad sequences (samples x time)")
x_train = sequence.pad_sequences(x_train_variable, maxlen=sentence_size,
                                 padding='post', value=0)
x_test = sequence.pad_sequences(x_test_variable, maxlen=sentence_size,
                                padding='post', value=0)
print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)
def load_imdb(max_features=20000, maxlen=1000):
    """Load IMDB, pad every review to `maxlen`, and return CSR sparse matrices.

    :param max_features: vocabulary cap passed to the Keras loader.
    :param maxlen: fixed sequence length after padding.
    :return: ((X_train, y_train), (X_val, y_val)) with X matrices in CSR form.
    """
    (train_x, train_y), (val_x, val_y) = _imdb.load_data(num_words=max_features)
    # Padded matrices are mostly zeros, so CSR storage keeps memory down.
    train_x = csr_matrix(pad_sequences(train_x, maxlen=maxlen))
    val_x = csr_matrix(pad_sequences(val_x, maxlen=maxlen))
    return (train_x, train_y), (val_x, val_y)
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, Embedding, Flatten
from tensorflow.python.keras.datasets import imdb
from tensorflow.python.keras import preprocessing

path = r"F:\5-model data\imdb.npz"
max_feature = 10000
maxlen = 20

# BUG FIX: renamed the misspelled `x_trian` to `x_train` throughout.
(x_train, y_train), (x_test, y_test) = imdb.load_data(path=path, num_words=max_feature)
x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)

# Tiny embedding + flatten + sigmoid classifier.
model = Sequential()
model.add(Embedding(10000, 8, input_length=maxlen))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.summary()
model.fit(x_train, y_train, epochs=10, batch_size=32, validation_split=0.2)
from tensorflow.python.keras.datasets import imdb
from tensorflow.python.keras.preprocessing import sequence

max_features = 10000
maxlen = 500
batch_size = 32

# BUG FIX: status-message typo "Lodaing" -> "Loading".
print('Loading data...')
imdb_path = r"F:\5-model data\imdb.npz"
(input_train, y_train), (input_test, y_test) = imdb.load_data(
    path=imdb_path,
    num_words=max_features
)
print(len(input_train), 'train sequences')
print(len(input_test), 'test sequences')

print('pad sequences (samples x time)')
input_train = sequence.pad_sequences(input_train, maxlen=maxlen)
input_test = sequence.pad_sequences(input_test, maxlen=maxlen)

# Train with an Embedding layer followed by a SimpleRNN layer.
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Embedding, SimpleRNN, Dense

model = Sequential()
model.add(Embedding(max_features, 32))
model.add(SimpleRNN(32))
model.add(Dense(1, activation='sigmoid'))
model.summary()
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['acc']
)
# NOTE(review): this chunk opens inside a model-builder function whose `def`
# line is outside this view; the indented fragment is kept verbatim.
    model.add(GlobalMaxPooling1D())
    model.add(Dense(1))
    model.summary()
    model.compile(optimizer=RMSprop(lr=1e-4), loss='binary_crossentropy', metrics=['acc'])
    return model


if __name__ == '__main__':
    max_features = 10000
    maxlen = 500
    batch_size = 32
    print('Loading data...')
    (input_train, y_train), (input_test, y_test) = imdb.load_data(num_words=max_features)
    print(len(input_train), 'train sequences')
    print(len(input_test), 'test sequences')
    print('Pad sequences (samples x time)')
    input_train = sequence.pad_sequences(input_train, maxlen=maxlen)
    input_test = sequence.pad_sequences(input_test, maxlen=maxlen)
    print('input_train shape:', input_train.shape)
    print('input_test shape:', input_test.shape)
    # RNN training is extremely slow
    # model = build_simple_rnn_model()
    # CuDNNLSTM is required to get GPU speed-ups; a plain LSTM is very slow
    model = build_LSTM_model()  # trains relatively fast
    # model = build_cnn_1d_model()
from input_functions import train_input_fn, eval_input_fn
from utils import load_glove_embeddings, load_data
from models import cnn_model_fn, lstm_model_fn

tf.logging.set_verbosity(tf.logging.INFO)
print(tf.__version__)

vocab_size = 5000
sentence_size = 200
embedding_size = 50
# model_dir = tempfile.mkdtemp()
model_dir = 'model'
project_path = os.path.dirname(os.path.abspath(__file__))

# Load the IMDB split from a local copy of imdb.npz, capped at vocab_size words.
(x_train_variable, y_train), (x_test_variable, y_test) = imdb.load_data(
    path=os.path.join(project_path, 'data/imdb.npz'), num_words=vocab_size)
print(len(y_train), "train sequences")
print(len(y_test), "test sequences")

# Pad/truncate every review to a fixed sentence_size with trailing zeros.
print("Pad sequences (samples x time)")
x_train = sequence.pad_sequences(x_train_variable, maxlen=sentence_size,
                                 padding='post', value=0)
x_test = sequence.pad_sequences(x_test_variable, maxlen=sentence_size,
                                padding='post', value=0)
print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)
# from github susanli2016
# https://github.com/andyngo95/SA_Positive_Negative_Comments/blob/master/Sentiment_Analysis_v2.ipynb
# https://towardsdatascience.com/light-on-math-ml-attention-with-keras-dc8dbc1fad39
# https://github.com/thushv89/attention_keras
from tensorflow.python.keras.datasets import imdb
from tensorflow.python.keras.preprocessing import sequence
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
# from attention_keras.layers.attention import AttentionLayer
import numpy as np

max_features = 10000
max_length = 500

(train_data, train_label), (test_data, test_labels) = imdb.load_data(num_words=max_features)
train_data = sequence.pad_sequences(train_data, maxlen=max_length)
test_data = sequence.pad_sequences(test_data, maxlen=max_length)

# Fold the first 15000 test reviews into the training set; keep the rest for testing.
train_data = np.concatenate((train_data, test_data[:15000]))
test_data = test_data[15000:]
train_label = np.concatenate((train_label, test_labels[:15000]))
test_labels = test_labels[15000:]

# Stacked LSTMs with dropout over learned embeddings.
embedding_size = 128
model = Sequential()
model.add(Embedding(max_features, embedding_size, input_length=max_length))
model.add(LSTM(embedding_size, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(embedding_size))
model.add(Dropout(0.2))
# raise SystemError('GPU device not found') else: print('Found GPU at: {}'.format(device_name)) config = tf.ConfigProto() config.gpu_options.allow_growth = True config.allow_soft_placement = True config.log_device_placement = False sess = tf.Session(config=config) tf.keras.backend.set_session(sess) # # save np.load # np_load_old = np.load # # modify the default parameters of np.load # np.load = lambda *a,**k: np_load_old(*a, allow_pickle=True, **k) num_words = 20000 (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=num_words) epoch = 5 batch_size = 64 word_to_index = imdb.get_word_index() word_to_index = {key:(value+3) for key,value in word_to_index.items()} word_to_index["<PAD>"] = 0 word_to_index["<START>"] = 1 word_to_index["<UNK>"] = 2 index_to_word = {value:key for key,value in word_to_index.items()} def print_sentence(id_list): print(' '.join([index_to_word[id] for id in id_list if id != 0])) print("Train-set size: ", len(x_train))
import numpy as np
from keras.layers import Dense, Input, concatenate
from keras.optimizers import SGD
from keras.models import Model
from tensorflow.python.keras.datasets import imdb

word_num = 6666
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=word_num)


def vectorize_sequences(sequences, dimension=word_num):
    """One-hot encode each integer sequence into a row of a dense matrix."""
    results = np.zeros((len(sequences), dimension))
    for row, seq in enumerate(sequences):
        results[row, seq] = 1
    return results


x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)
# Vectorize the label data as float32.
y_train = np.asarray(train_labels).astype('float32')
y_test = np.asarray(test_labels).astype('float32')

# Hold out the first 2000 samples for validation.
x_val = x_train[:2000]
partial_x_train = x_train[2000:]
y_val = y_train[:2000]
from tensorflow.python.keras.datasets import imdb

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)
# print(train_data)
# To decode a review back to text:
# word_index = imdb.get_word_index()
# reverse_word_index = dict(
#     [(value, key) for (key, value) in word_index.items()]
# )
# decoded_review = ' '.join([reverse_word_index.get(i - 3, "?") for i in train_data[0]])
# print(reverse_word_index)
# print(decoded_review)

import numpy as ny


def vectorize_sequences(sequences, dimension=10000):
    """One-hot encode each integer sequence into a row of a dense matrix."""
    results = ny.zeros((len(sequences), dimension))
    for row, seq in enumerate(sequences):  # enumerate walks (index, sequence) pairs
        results[row, seq] = 1.
    return results


# Vectorize the training and test data.
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)
# print(x_train, x_test)

# Vectorize the labels.
y_train = ny.asarray(train_labels).astype('float32')
y_test = ny.asarray(test_labels).astype('float32')

# Hold-out validation set follows.
print(tf.__version__)

vocab_size = 5000
sentence_size = 200
embedding_size = 50
model_dir = tempfile.mkdtemp()

# Special-token ids handed to the Keras IMDB loader.
pad_id = 0
start_id = 1
oov_id = 2
index_offset = 2

print("Loading data...")
(x_train_variable, y_train), (x_test_variable, y_test) = imdb.load_data(
    num_words=vocab_size,
    start_char=start_id,
    oov_char=oov_id,
    index_from=index_offset)
print(len(y_train), "train sequences")
print(len(y_test), "test sequences")

# Pad/truncate every review to a fixed sentence_size.
print("Pad sequences (samples x time)")
x_train = sequence.pad_sequences(x_train_variable, maxlen=sentence_size,
                                 truncating='post', padding='post',
                                 value=pad_id)
x_test = sequence.pad_sequences(x_test_variable, maxlen=sentence_size,
                                truncating='post', padding='post',
                                value=pad_id)
def bigdl_estimator():
    """Train a small Transformer on a 1k-review IMDB subset with the Orca BigDL Estimator.

    Side effects: starts/stops an Orca context, downloads the Keras IMDB
    dataset, and writes a saved model under ``work/``.
    """
    from zoo.orca.learn.bigdl.estimator import Estimator
    from tensorflow.python.keras.datasets import imdb
    from tensorflow.python.keras.preprocessing import sequence
    from zoo.pipeline.api.keras.models import Model
    from zoo.pipeline.api.keras.objectives import SparseCategoricalCrossEntropy
    from zoo.orca.data import XShards
    from zoo.orca.learn.metrics import Accuracy
    import numpy as np

    # conf = {"spark.executor.extraJavaOptions": "-Xss512m", "spark.driver.extraJavaOptions": "-Xss512m"}
    # init_orca_context(cluster_mode="local", cores=8, memory="16g")
    init_orca_context(cluster_mode="local", cores=4, memory="16g")

    max_features = 200
    max_len = 20
    print("running bigdl estimator")

    # Keep only 1000 samples per split so the smoke test stays fast.
    (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
    x_train, y_train = x_train[:1000], y_train[:1000]
    x_test, y_test = x_test[-1000:], y_test[-1000:]
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')

    print('Pad sequences (samples x time)')
    x_train = sequence.pad_sequences(x_train, maxlen=max_len)
    x_test = sequence.pad_sequences(x_test, maxlen=max_len)
    print('x_train shape:', x_train.shape)
    print('x_test shape:', x_test.shape)

    # Position indices 0..max_len-1 for every sample; broadcasting fills each row.
    train_pos = np.zeros((len(x_train), max_len), dtype=np.int32)
    val_pos = np.zeros((len(x_test), max_len), dtype=np.int32)
    train_pos[:] = np.arange(max_len)
    val_pos[:] = np.arange(max_len)

    train_dataset = XShards.partition({"x": (x_train, train_pos), "y": np.array(y_train)})
    val_dataset = XShards.partition({"x": (x_test, val_pos), "y": np.array(y_test)})

    token_input = Input(shape=(max_len,))
    position_input = Input(shape=(max_len,))
    O_seq = TransformerLayer.init(vocab=max_features, hidden_size=128, n_head=8,
                                  seq_len=max_len)([token_input, position_input])
    # Select the first output of the Transformer. The second is the pooled output.
    O_seq = SelectTable(0)(O_seq)
    O_seq = GlobalAveragePooling1D()(O_seq)
    O_seq = Dropout(0.2)(O_seq)
    outputs = Dense(2, activation='softmax')(O_seq)
    model = Model([token_input, position_input], outputs)
    model.summary()

    batch_size = 64
    print("Train started")
    est = Estimator.from_bigdl(model=model, loss=SparseCategoricalCrossEntropy(),
                               optimizer=Adam(), metrics=[Accuracy()])

    # Round 1: constant gradient clipping.
    est.set_constant_gradient_clipping(0.1, 0.2)
    est.fit(data=train_dataset, batch_size=batch_size, epochs=1)
    result = est.evaluate(val_dataset)
    print(result)

    # Round 2: L2-norm gradient clipping.
    est.clear_gradient_clipping()
    est.set_l2_norm_gradient_clipping(0.5)
    est.fit(data=train_dataset, batch_size=batch_size, epochs=1)
    print("Train finished")

    print("Evaluating started")
    result = est.evaluate(val_dataset)
    print(result)
    print("Evaluating finished")

    est.save('work/saved_model')
    # est.load('work/saved_model')
    print("load and save API finished")

    est.get_train_summary(tag='Loss')
    est.get_validation_summary(tag='Top1Accuracy')
    print("get summary API finished")
    stop_orca_context()