from numpy.random import seed
seed(1)
import tensorflow as tf
tf.random.set_seed(2)
from tensorflow import keras
from tensorflow.keras.datasets import imdb
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
import pandas
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy

(x_train, y_train), (x_test, y_test) = imdb.load_data(path="imdb.npz", num_words=None, skip_top=0,
                                                      maxlen=666, seed=113, start_char=1,
                                                      oov_char=2, index_from=3)
x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen=666)

word_to_id = imdb.get_word_index()
word_to_id = {k: (v + 3) for k, v in word_to_id.items()}
word_to_id["<PAD>"] = 0
word_to_id["<START>"] = 1
word_to_id["<UNK>"] = 2
id_to_word = {value: key for key, value in word_to_id.items()}
print(' '.join(id_to_word[id] for id in x_train[0]))

model = keras.Sequential()
from tensorflow.keras.datasets import imdb

# set parameters:
wandb.init()
config = wandb.config
config.vocab_size = 1000
config.maxlen = 1000
config.batch_size = 32
config.embedding_dims = 50
config.filters = 250
config.kernel_size = 3
config.hidden_dims = 250
config.epochs = 10

(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=config.vocab_size)
X_train = sequence.pad_sequences(X_train, maxlen=config.maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=config.maxlen)
print(X_train.shape)
print("After pre-processing", X_train[0])

# override LSTM & GRU with their cuDNN-backed variants when a GPU is available
if 'GPU' in str(device_lib.list_local_devices()):
    print("Using CUDA for RNN layers")
    LSTM = CuDNNLSTM
    GRU = CuDNNGRU

model = tf.keras.models.Sequential()
model.add(
    tf.keras.layers.Embedding(config.vocab_size,
maxlen = 400
embedding_dims = 100
epochs = 10
batch_size = 256
max_features = 5000

MODEL_NAME = 'TextBiRNN-epoch-10-emb-100-avg2-dense2'

use_early_stop = True
tensorboard_log_dir = 'logs\\{}'.format(MODEL_NAME)
# checkpoint_path = "save_model_dir\\{}\\cp-{epoch:04d}.ckpt".format(MODEL_NAME, '')
checkpoint_path = 'save_model_dir\\' + MODEL_NAME + '\\cp-{epoch:04d}.ckpt'
# ====================================================================

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

print('Pad sequences (samples x time)...')
x_train = pad_sequences(x_train, maxlen=maxlen, padding='post')
x_test = pad_sequences(x_test, maxlen=maxlen, padding='post')
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

model_hepler = ModelHepler(class_num=class_num,
                           maxlen=maxlen,
                           max_features=max_features,
                           embedding_dims=embedding_dims,
                           epochs=epochs,
                           batch_size=batch_size)
model_hepler.get_callback(use_early_stop=use_early_stop,
                          tensorboard_log_dir=tensorboard_log_dir,
import numpy as np
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt

# Random seed
np.random.seed(0)

number_of_features = 10000

np_load_old = np.load
# modify the default parameters of np.load
np.load = lambda *a, **k: np_load_old(*a, allow_pickle=True, **k)
# call load_data with allow_pickle implicitly set to true
(data_train, target_train), (data_test, target_test) = imdb.load_data(num_words=number_of_features)

# Convert the movie review data into a one-hot-encoded feature matrix
tokenizer = Tokenizer(num_words=number_of_features)
features_train = tokenizer.sequences_to_matrix(data_train, mode="binary")
features_test = tokenizer.sequences_to_matrix(data_test, mode="binary")

# Build the neural network model.
network = models.Sequential()
# Add a dropout layer as the input layer.
network.add(layers.Dropout(0.3, input_shape=(number_of_features,)))
# Add a fully connected layer with a ReLU activation function.
To make this tutorial easy to follow, we simply treat the IMDB dataset as a
regression dataset. That is, we treat the prediction targets of the IMDB dataset,
which are 0s and 1s, as numerical values so that they can be used directly as
regression targets.

## A Simple Example
The first step is to prepare your data. Here we use the
[IMDB dataset](https://keras.io/datasets/#imdb-movie-reviews-sentiment-classification)
as an example.
"""

import numpy as np
from tensorflow.keras.datasets import imdb

# Load the IMDB dataset as integer sequences with Keras.
index_offset = 3  # word index offset
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=1000,
                                                      index_from=index_offset)
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

# Prepare the dictionary mapping indices to words.
word_to_id = imdb.get_word_index()
word_to_id = {k: (v + index_offset) for k, v in word_to_id.items()}
word_to_id["<PAD>"] = 0
word_to_id["<START>"] = 1
word_to_id["<UNK>"] = 2
id_to_word = {value: key for key, value in word_to_id.items()}

# Convert the word indices back to words.
x_train = list(map(lambda sentence: ' '.join(
    id_to_word[i] for i in sentence), x_train))
x_test = list(map(lambda sentence: ' '.join(
    id_to_word[i] for i in sentence), x_test))
x_train = np.array(x_train, dtype=str)
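# A minimal sketch (not from the original tutorial) of what "treating IMDB as a
# regression dataset" means in practice: the decoded review strings are vectorized
# and a linear output unit is trained with a mean-squared-error loss on the 0/1
# targets. The vectorization settings and model size here are illustrative.
import tensorflow as tf

vectorize = tf.keras.layers.TextVectorization(max_tokens=1000, output_mode="multi_hot")
vectorize.adapt(x_train)                                    # learn the vocabulary from the training strings
x_train_mh = vectorize(x_train).numpy().astype("float32")   # (num_samples, 1000) multi-hot matrix

reg_model = tf.keras.Sequential([
    tf.keras.layers.Dense(16, activation="relu", input_shape=(1000,)),
    tf.keras.layers.Dense(1),                 # linear output: a numeric prediction, not a class
])
reg_model.compile(optimizer="adam", loss="mse")  # regression loss on the 0/1 targets
reg_model.fit(x_train_mh, y_train, epochs=2, batch_size=128)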
args = parser.parse_args()

# Set seeds for reproducibility
tf.random.set_seed(500)

# Set global constants
vocabulary_size = 10000  # choose 10k most-used words for truncated vocabulary
sequence_length = 500    # choose 500-word sequences, either pad or truncate sequences to this
embedding_dims = 50      # number of dimensions to represent each word in vector space
batch_size = 100         # feed in the neural network in 100-example training batches
num_epochs = 10          # number of times the neural network goes over EACH training example
config = int(args.config)  # model configuration

# Load the IMDB dataset for sentiment classification
(X_train, Y_train), (X_test, Y_test) = imdb.load_data(num_words=vocabulary_size)

# Pad & truncate sequences to fixed sequence length
X_train = pad_sequences(sequences=X_train, maxlen=sequence_length)
X_test = pad_sequences(sequences=X_test, maxlen=sequence_length)

# Create word-level binary sentiment classification model
# Input Layer
X = Input(shape=(sequence_length,), batch_size=batch_size)
# Word-Embedding Layer
embedded = Embedding(input_dim=vocabulary_size, output_dim=embedding_dims)(X)

# Optional Self-Attention Mechanisms
if config == 1:
    embedded, attention_weights = SelfAttention(
from tensorflow import keras
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

(train_input, train_target), (test_input, test_target) = imdb.load_data(num_words=500)
train_input, val_input, train_target, val_target = train_test_split(
    train_input, train_target, test_size=0.2, random_state=2021)

train_seq = pad_sequences(train_input, maxlen=100)
val_seq = pad_sequences(val_input, maxlen=100)

model = keras.Sequential()
model.add(keras.layers.Embedding(500, 16, input_length=100))
model.add(keras.layers.LSTM(8, dropout=0.3))
model.add(keras.layers.Dense(1, activation="sigmoid"))

rmsprop = keras.optimizers.RMSprop(learning_rate=1e-4)
model.compile(optimizer=rmsprop, loss="binary_crossentropy", metrics=["accuracy"])

checkpoint = keras.callbacks.ModelCheckpoint("best_lstm.h5")
stoppoint = keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)

history = model.fit(train_seq, train_target, epochs=100, batch_size=64,
                    validation_data=(val_seq, val_target),
parser.add_argument("--n_cells", required=False, default=32) parser.add_argument("--batch_size", required=False, default=32) parser.add_argument("--learning_rate", required=False, default=0.01) args = parser.parse_args() print("A kind of train word: ", args.num_words) print("A kind of non train word: ", args.skip_top) print("The number of cells: ", args.n_cells) print("Batch size: ", args.batch_size) print("Learning rate: ", args.learning_rate) # Load tensorflow keras imdb dataset. print("Loading...") (x_train_all, y_train_all), (x_test, y_test) = \ imdb.load_data(skip_top=args.skip_top, num_words=args.num_words) # Remove all paddings 0, indents 1, no words 2. for i, sample in enumerate(x_train_all): x_train_all[i] = [n for n in sample if n > 2] # Load a dictionary. word2index = imdb.get_word_index() index2word = {word2index[k]: k for k in word2index} # Shffle dataset. np.random.seed(42) random_index = np.random.permutation(25000) x_train = x_train_all[random_index[:20000]] y_train = y_train_all[random_index[:20000]] x_val = x_train_all[random_index[20000:]]
                         for key, word in vocabulary_inv.items()}
    return embedding_weights


# Preprocessing parameters
sequence_length = 400
max_words = 5000

# Word2Vec parameters
min_word_count = 1
context = 10

imdb = tf.keras.datasets.imdb
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=5000, start_char=None,
                                                      oov_char=None, index_from=None)

x_train = sequence.pad_sequences(x_train, maxlen=sequence_length, padding="post", truncating="post")
x_test = sequence.pad_sequences(x_test, maxlen=sequence_length, padding="post", truncating="post")

vocabulary = imdb.get_word_index()
vocabulary_inv = dict((v, k) for k, v in vocabulary.items())
vocabulary_inv[0] = "<PAD/>"

embedding_weights = train_word2vec(np.vstack((x_train, x_test)), vocabulary_inv,
                                   num_features=embedding_dim,
                                   min_word_count=min_word_count, context=context)

# Teacher predictions
# teacher = 'lstm'
# def get_teacher_predictions(teacher):
#     if teacher == 'lstm':
            return lemmatizer.lemmatize(token, 'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token, 'v')
        else:
            return lemmatizer.lemmatize(token, 'n')

    pre_proc_text = " ".join([prat_lemmatize(token, tag) for token, tag in tagged_corpus])
    return pre_proc_text


# The IMDB data contains 88,584 distinct words in total (the vocabulary size).
# Keep only the 6,000 most frequent words and mark the rest as OOV.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=6000, start_char=0, oov_char=0, index_from=0)

# Combine the train and test data; split it again later if needed.
text = np.hstack([x_train, x_test])
label = np.hstack([y_train, y_test])

# Fetch the vocabulary.
word2idx = imdb.get_word_index()
idx2word = dict((v, k) for k, v in word2idx.items())

# Mark start_char and oov_char with '.'; they are removed later during preprocessing.
idx2word[0] = '.'

# Convert the numerically encoded x_train back into actual words.
def run_imdb():
    # Extract useful data from dataset
    print('Extracting the IMDB dataset')
    (train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)

    # Illustration of the input data
    print(
        f'In this dataset a label of 1 indicates a positive review, 0 a negative review.\nHaving taken the top 10,000'
        f' most-used words no word index will exceed 10,000.\nMax Index = '
        f'{max([max(sequence) for sequence in train_data])}')
    print(
        f"For the sake of illustration, let's decode a review back to English (not being printed for easier reading)")
    word_index = imdb.get_word_index()
    reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
    decoded_review = ''.join([reverse_word_index.get(i - 3, '?') for i in train_data[0]])
    # print(decoded_review)

    # Encoding the inputs
    print("In order to pass these lists of integers into a neural network we must first encode them as tensors of "
          "uniform length.\nIn this example we'll use one-hot encoding, done manually for the sake of understanding.")

    def vectorise_sequences(sequences, dimension=10000):
        ret = np.zeros((len(sequences), dimension))
        for i, sequence in enumerate(sequences):
            ret[i, sequence] = 1
            if i < 1:
                print(f"\n{sequence} => {ret[i]}\n")
        return ret

    x_train = vectorise_sequences(train_data)
    y_train = np.asarray(train_labels).astype('float32')
    x_test = vectorise_sequences(test_data)
    y_test = np.asarray(test_labels).astype('float32')

    # Design and compile the model
    print("Now to build the network, this time using parameters with greater configurability")
    model = models.Sequential()
    model.add(layers.Dense(16, activation='relu', input_shape=(10000,)))
    model.add(layers.Dense(16, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer=optimizers.RMSprop(lr=0.001),
                  loss='binary_crossentropy',
                  metrics=[metrics.binary_accuracy])

    # Divide the training data
    print("Creating a validation set for greater insight during training")
    x_val = x_train[:10000]            # Taking the 1st 10000 samples for validation
    partial_x_train = x_train[10000:]  # Leaving everything from 10000 onwards for training
    y_val = y_train[:10000]            # Taking the 1st 10000 labels for validation
    partial_y_train = y_train[10000:]  # Leaving everything from 10000 onwards for training

    # Train the model
    print("Begin training the model:")
    history = model.fit(partial_x_train, partial_y_train, epochs=20, batch_size=512,
                        validation_data=(x_val, y_val))
    history_dict = history.history
    print(f"\nNote that the history returned by the fit function has a 'history' member which is a dictionary. "
          f"The keys are: {history_dict.keys()}")
    # ['loss', 'binary_accuracy', 'val_loss', 'val_binary_accuracy']

    # Prepare to plot the training and validation information
    loss_values = history_dict['loss']
    val_loss_values = history_dict['val_loss']
    acc_values = history_dict['binary_accuracy']
    val_acc_values = history_dict['val_binary_accuracy']
    epochs = range(1, len(history_dict['binary_accuracy']) + 1)

    plt.plot(epochs, loss_values, 'bo', label='Training Loss')
    plt.plot(epochs, val_loss_values, 'b', label='Validation Loss')
    plt.title('Training and Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

    plt.clf()
    plt.plot(epochs, acc_values, 'bo', label='Training Accuracy')
    plt.plot(epochs, val_acc_values, 'b', label='Validation Accuracy')
    plt.title('Training and Validation Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()

    # Evaluate the model
    print("\nAfter reviewing each plot, evaluate the performance of the model on new data")
    results = model.evaluate(x_test, y_test)
    print(f"Evaluation Results: Loss = {results[0]} Accuracy = {results[1] * 100}%")
import tensorflow as tf
import numpy as np
from tensorflow.keras.datasets import imdb
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

vocab_size = 10000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=vocab_size)

print('Maximum review length: {}'.format(max(len(l) for l in X_train)))
print('Average review length: {}'.format(sum(map(len, X_train)) / len(X_train)))

max_len = 500
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)

y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

print(np.shape(X_train), np.shape(y_train))
                    default=1, metavar='S', help='random seed (default: 1)')
parser.add_argument('--vocab-size', type=int, default=2000,
                    help='Max size of the vocabulary (default: 2000)')
parser.add_argument('--max-len', type=int, default=250,
                    help='Sequence max length (default: 250)')
args = parser.parse_args()

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=args.vocab_size, maxlen=args.max_len)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=args.max_len)
x_test = sequence.pad_sequences(x_test, maxlen=args.max_len)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Load ONNX model...')
onnx_model = onnx.load(args.model_path)
# onnx.checker.check_model(onnx_model)

print('Convert ONNX to Keras...')
        print(e)
    except:
        print('Static mode selected but no memory limit set. Please set a memory limit by adding the flag -gm=X (gb) or --gpumemory=x (gb) after -m=s or --memory=s')
        quit()
else:
    physical_devices = tf.config.experimental.list_physical_devices('GPU')
    for physical_device in physical_devices:
        tf.config.experimental.set_memory_growth(physical_device, True)

#%%
max_features = 2000
max_len = 500

(X_train, Y_train), (X_test, Y_test) = imdb.load_data(num_words=max_features)
X_train = sequence.pad_sequences(X_train, maxlen=max_len)
X_test = sequence.pad_sequences(X_test, maxlen=max_len)

model = tf.keras.models.Sequential()
model.add(layers.Embedding(max_features, 128, input_length=max_len, name='embed'))
model.add(layers.Conv1D(32, 7, activation='relu'))
model.add(layers.MaxPooling1D(5))
model.add(layers.Conv1D(32, 7, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(1))
model.summary()
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, Embedding, GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical

vocab_size = 5000
max_len = 256
model_dim = 512  # word-vector size produced by the Embedding layer
batch_size = 128
epochs = 10

print("Data downloading and pre-processing ... ")
(x_train, y_train), (x_test, y_test) = imdb.load_data(maxlen=max_len, num_words=vocab_size)
x_train = sequence.pad_sequences(x_train, maxlen=max_len, padding='post')
x_test = sequence.pad_sequences(x_test, maxlen=max_len, padding='post')
x_train_masks = tf.equal(x_train, 0)
x_test_masks = tf.equal(x_test, 0)
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

print('Model building ... ')
inputs = Input(shape=(max_len,), name="inputs")
masks = Input(shape=(max_len,), name='masks')
embeddings = Embedding(vocab_size, model_dim,
                       embeddings_initializer='uniform',
                       input_length=max_len,
# Import TensorFlow and TensorFlow Eager
import tensorflow as tf
import tensorflow.contrib.eager as tfe
import numpy as np
import matplotlib.pyplot as plt
import os
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.datasets import imdb

tfe.enable_eager_execution()

# 25,000 training examples and 25,000 test examples
vocab_size = 10000
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=vocab_size)

# Dict mapping each word to its index
word_index = imdb.get_word_index()
# Dict mapping each index back to its word. Note that 0 is a special key, representing an unknown word.
index_word = dict([(value, key) for (key, value) in word_index.items()])
# print("LEN:", len(set(index_word.keys())))
# print(index_word[int(2)])

# We decode the review; note that our indices were offset by 3
# because 0, 1 and 2 are reserved indices for "padding", "start of sequence", and "unknown".
decoded_review = ' '.join([index_word.get(i - 3, '?') for i in train_data[10]])
# print(train_data[10])
# print(decoded_review)

# Truncate or pad every sentence to a length of 80 words
from tensorflow.keras import models
from tensorflow.keras.datasets import imdb
import numpy as np
import matplotlib.pyplot as plt
from tuto_utils import util_func


# Decoder
def decode_review(index_review):
    word_index = imdb.get_word_index()
    reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
    # i - 3 because 0, 1, 2 are reserved indices for "padding", "start of sequence" and "unknown"
    return ' '.join([reverse_word_index.get(i - 3, '?') for i in index_review])


# IMDB dataset: label 0 is negative, 1 is positive
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)


# np allows np.array[i, []] = 1 style assignment, cool!
def vectorize_sequences(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1
    return results


x_train, x_test = vectorize_sequences(train_data), vectorize_sequences(test_data)
y_train, y_test = np.asarray(train_labels).astype('float32'), np.asarray(test_labels).astype('float32')

epochs = 5
model = models.Sequential()
            else:
                result.append(0)
        return result

    # Model evaluation
    def accuracy(self, y, y_pred):
        right = 0
        for i in range(len(y)):
            if y[i] == y_pred[i]:
                right += 1
        return right / len(y)


if __name__ == "__main__":
    # Use the IMDB movie review data for binary classification
    (train_data, train_labels), (test_data, test_labels) = imdb.load_data(
        num_words=1000)  # keep only the 1,000 most frequent words in the training data

    # Pad the lists to the same length by one-hot encoding them
    def vectorize_sequences(sequences, dimension=1000):
        results = np.zeros((len(sequences), dimension))
        for i, sequence in enumerate(sequences):
            results[i, sequence] = 1.
        return results

    x_train = vectorize_sequences(train_data)
    x_test = vectorize_sequences(test_data)

    # Instantiate the LR class
    LR = LogisticRegressionClassifier()
    # Train the model
    LR.fit(x_train, train_labels)
# train it! Just because a model performs well on its training data doesn't mean that it will perform well on data it has never seen, and
# what you actually care about is your model's performance on new data (since you already know the labels of your training data -- obviously
# you don't need your model to predict those). For instance, it is possible that your model could end up merely _memorizing_ a mapping between
# your training samples and their targets -- which would be completely useless for the task of predicting targets for data never seen before.
# We will go over this point in much more detail in the next chapter.
#
# Just like the MNIST dataset, the IMDB dataset comes packaged with Keras. It has already been preprocessed: the reviews (sequences of words)
# have been turned into sequences of integers, where each integer stands for a specific word in a dictionary.
#
# The following code will load the dataset (when you run it for the first time, about 80MB of data will be downloaded to your machine):

# %%
from tensorflow.keras.datasets import imdb

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)

# %% [markdown]
#
# The argument `num_words=10000` means that we will only keep the top 10,000 most frequently occurring words in the training data. Rare words
# will be discarded. This allows us to work with vector data of manageable size.
#
# The variables `train_data` and `test_data` are lists of reviews, each review being a list of word indices (encoding a sequence of words).
# `train_labels` and `test_labels` are lists of 0s and 1s, where 0 stands for "negative" and 1 stands for "positive":

# %%
train_data[0]

# %%
train_labels[0]
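# %% [markdown]
# A quick sanity check (not part of the original notebook) of the two claims above: with
# `num_words=10000` no word index exceeds 9999, and the labels take only the values 0 and 1.

# %%
print(max(max(sequence) for sequence in train_data))  # expected: 9999
print(set(train_labels))                              # expected: {0, 1}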
import numpy
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence

# fix random seed for reproducibility
numpy.random.seed(7)

# load the dataset but only keep the top n words, zero the rest
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(path="imdb.npz", num_words=top_words)

# Truncate and pad the input sequences.
max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)


class SequenceClassifier(tf.keras.Model):
    def __init__(self, num_words, vector_size, batch_size):
        super(SequenceClassifier, self).__init__()
        self._embedding = Embedding(num_words, vector_size, batch_input_shape=(batch_size, None))
        self._lstm = tf.keras.layers.LSTM(10, return_state=True)
        self._dense = Dense(1, activation='sigmoid')
import tensorflow as tf
import numpy as np
from tensorflow.keras.datasets import imdb

number_of_words = 20000
max_len = 100

# load dataset
# https://stackoverflow.com/questions/55890813/how-to-fix-object-arrays-cannot-be-loaded-when-allow-pickle-false-for-imdb-loa/56062555
# np_load_old = np.load
# np.load = lambda *a, **k: np_load_old(*a, allow_pickle=True, **k)
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=number_of_words)
# np.load = np_load_old

# padding
X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, maxlen=max_len)
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, maxlen=max_len)

# RNN
model = tf.keras.Sequential()
model.add(
    tf.keras.layers.Embedding(input_dim=number_of_words,
                              output_dim=128,
                              input_shape=(X_train.shape[1],)))
# tf.keras.layers.CuDNNLSTM
model.add(tf.keras.layers.LSTM(units=128, activation='tanh'))
word2id = imdb.get_word_index()  # dictionary from words to integers (the id of the word in the vocab)
id2word = {i: word for word, i in word2id.items()}

# The embedding matrix holds the vector representation for the words.
embedding_matrix = np.zeros((vocabulary_size, 100))  # 50000
for word, index in word2id.items():
    if index > vocabulary_size - 1:
        continue
    else:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector
# for i in range(0, 3):
#     print("The glove embedding for '{}' is {} ".format(list(word2id.keys())[i], embedding_matrix[i]))

(X_train_full, y_train_full), (X_test, y_test) = imdb.load_data(num_words=vocabulary_size)
train_size = len(X_train)
X_train_full = pad_sequences(X_train_full, maxlen=1000)
X_train, X_valid = X_train_full[5000:], X_train_full[:5000]
y_train, y_valid = y_train_full[5000:], y_train_full[:5000]
X_test = pad_sequences(X_test, maxlen=1000)

# activation_choice = 'tanh'
# drop_out_choice = 0.0
# optimizer_choice = 'adam'
# train_batch_size_choice = 32
# epochs_choice = 5


def part3(num_epochs, train_batch_size_choice, activation_choice, optimizer_choice, drop_out_choice):
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout, Conv1D, MaxPooling1D

np.random.seed(0)
tf.random.set_seed(0)

# Load the data
(X_tn0, y_tn0), (X_te0, y_test) = imdb.load_data(num_words=2000)  # keep only the most commonly used words
print(X_tn0.shape)    # (25000,)
print(y_tn0.shape)    # (25000,)
print(X_te0.shape)    # (25000,)
print(y_test.shape)   # (25000,)

X_train = X_tn0[:20000]
y_train = y_tn0[:20000]
X_valid = X_tn0[20000:25000]
y_valid = y_tn0[20000:25000]

print(X_train[0])
print(len(X_train[0]))   # 218
print(len(X_train[1]))   # 189
print(set(y_test))
print(len(set(y_test)))  # 2

# Pre-processing
import matplotlib.pyplot as plt
import numpy as np
from tensorflow.keras import Sequential, regularizers
from tensorflow.keras.datasets import imdb
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

(X_train, y_train), (X_test, y_test) = imdb.load_data()
(training_data, training_targets), (testing_data, testing_targets) = imdb.load_data(num_words=500)
data = np.concatenate((training_data, testing_data), axis=0)
targets = np.concatenate((training_targets, testing_targets), axis=0)

index = imdb.get_word_index()
reverse_index = dict([(value, key) for (key, value) in index.items()])
decoded = " ".join([reverse_index.get(i - 3, "#") for i in data[0]])
print(decoded)


def plot_loss(loss, v_loss):
    plt.figure(1, figsize=(8, 5))
    plt.plot(loss, 'b', label='train')
    plt.plot(v_loss, 'r', label='validation')
    plt.title('Loss')
    plt.ylabel('loss')
    plt.xlabel('epochs')
    plt.legend()
    plt.show()
    plt.clf()
    coded = [-2]
    coded.extend([index.get(i, 0) for i in txt])
    for i in range(len(coded)):
        if coded[i]:
            coded[i] += 3
        if coded[i] >= idim:
            coded[i] = 2
    print(coded)
    return coded


num = 10000
(training_data, training_targets), (testing_data, testing_targets) = imdb.load_data(num_words=num)
data = np.concatenate((training_data, testing_data), axis=0)
targets = np.concatenate((training_targets, testing_targets), axis=0)

data = vectorize(data, num)
targets = np.array(targets).astype("float32")

test_x = data[:10000]
test_y = targets[:10000]
train_x = data[10000:]
train_y = targets[10000:]

model = models.Sequential()
model.add(layers.Dense(50, activation="relu", input_shape=(num,)))
model.add(layers.Dropout(0.5))
@author: santc
"""

## Step 2: Import the libraries
import tensorflow as tf
from tensorflow.keras.datasets import imdb

tf.__version__

## Step 3: Pre-processing
### Set the parameters for the dataset
number_of_words = 20000
max_len = 100

### Load the IMDB dataset
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=number_of_words)
X_train.shape
X_train
X_train[0]
# Original dataset with the raw texts: https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
y_train

### Pad the sequences (texts) so they all have the same length
len(X_train[0])
len(X_train[1])
X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, maxlen=max_len)
len(X_train[0])
len(X_train[1])
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, maxlen=max_len)
import numpy as np
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, SimpleRNN

seed = 10
np.random.seed(seed)  # set the random seed

# Load the IMDb dataset
top_words = 1000
(X_train, Y_train), (X_test, Y_test) = imdb.load_data(num_words=top_words)

# Data preprocessing
max_words = 100
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)

# Define the model
model = Sequential()
model.add(Embedding(top_words, 32, input_length=max_words))
model.add(Dropout(0.25))
model.add(SimpleRNN(32))
model.add(Dense(256, activation="relu"))
model.add(Dropout(0.25))
model.add(Dense(1, activation="sigmoid"))
model.summary()  # show a summary of the model

# Compile the model
model.compile(loss="binary_crossentropy", optimizer="rmsprop", metrics=["accuracy"])

# Train the model
history = model.fit(X_train, Y_train,
args = modelParams()
args.model_name = modelName
args.save_dir = modelName + "/"

githubFolder = modelName + "/"
if not os.path.exists(githubFolder):
    os.makedirs(githubFolder)
args.githubFolder = githubFolder

# Read the data
# X_train, X_validation, X_test, y_train, y_validation, y_test, num_classes = readPartialData(taskName, 0.25)
(X_train, y_train), (X_test, y_test) = imdb.load_data(path="imdb_full.pkl", num_words=None,
                                                      skip_top=0, maxlen=None, seed=113,
                                                      start_char=1, oov_char=2, index_from=3)

X_val = X_test[0:10000]
X_test = X_test[10000:]
y_val = y_test[0:10000]
y_test = y_test[10000:]

X = np.concatenate((X_test, X_train, X_val))
# maxes = [np.max(i) for i in X]
# maxes = np.array(maxes)
# print(len(maxes))
# print(np.max(maxes))  # <-- vocab size = 88586+1
import os

from tensorflow import keras
from tensorflow.keras import layers
# from keras import optimizers, losses, metrics, layers
from tensorflow.keras.datasets import imdb

# tf.__version__
# from keras.models import Sequential
# from keras.layers import Dense, Dropout, Activation, Flatten
# from keras.layers import Convolution2D, MaxPooling2D
# from keras.utils import np_utils
# from keras import backend as K

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

num_words = 30000
maxlen = 200

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=num_words)
print(x_train.shape, ' ', y_train.shape)
print(x_test.shape, ' ', y_test.shape)

x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen, padding='post')
x_test = keras.preprocessing.sequence.pad_sequences(x_test, maxlen, padding='post')
print(x_train.shape, ' ', y_train.shape)
print(x_test.shape, ' ', y_test.shape)


def lstm_model():
    model = keras.Sequential([
        layers.Embedding(input_dim=num_words,
"""Prac2_imdb_classification.ipynb Automatically generated by Colaboratory. Original file is located at https://colab.research.google.com/drive/1zs--3ULwCynfqLilHDzTUEoPb51c_qTT """ import numpy as np import matplotlib.pyplot as plt from tensorflow.keras.datasets import imdb from tensorflow.keras import models,layers from tensorflow import keras import tensorflow as tf (train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000) # Keep only top 10,000 most frequently words word_index = imdb.get_word_index() # word_index is a dictionary mapping words to an integer index. reverse_word_index = dict([(value, key) for (key, value) in word_index.items()]) # Reverses it, mapping integer indices to words # Decodes the review to read the content. Note that the indcies are offset by 3, # becuase 0, 1, 2 are reserved indices for "padding", "start of sequence," and "Unknown" decoded_review = ' '.join([reverse_word_index.get(i-3, '?') for i in train_data[100]]) print(decoded_review) # one-hot encoding def vectorize_sequences(sequences, dimension=10000): results = np.zeros((len(sequences), dimension)) for i, sequence in enumerate(sequences): results[i, sequence] = 1 return results