Example #1
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed  # TF 1.x API; in TF 2.x this is tf.random.set_seed(2)
set_random_seed(2)

from tensorflow import keras
from tensorflow.keras.datasets import imdb
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
import pandas
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy

(x_train, y_train), (x_test, y_test) = imdb.load_data(path="imdb.npz",
                                                      num_words=None,
                                                      skip_top=0,
                                                      maxlen=666,
                                                      seed=113,
                                                      start_char=1,
                                                      oov_char=2,
                                                      index_from=3)
x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen=666)

word_to_id = imdb.get_word_index()
word_to_id = {k: (v + 3) for k, v in word_to_id.items()}
word_to_id["<PAD>"] = 0
word_to_id["<START>"] = 1
word_to_id["<UNK>"] = 2

id_to_word = {value: key for key, value in word_to_id.items()}
print(' '.join(id_to_word[id] for id in x_train[0]))

model = keras.Sequential()
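# The excerpt ends here; an illustrative continuation using the layers imported above
# (a sketch with arbitrary layer sizes, not the original author's code):
model.add(Embedding(max(word_to_id.values()) + 1, 128, input_length=666))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])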
Example #2
from tensorflow.keras.datasets import imdb

# set parameters:
wandb.init()
config = wandb.config
config.vocab_size = 1000
config.maxlen = 1000
config.batch_size = 32
config.embedding_dims = 50
config.filters = 250
config.kernel_size = 3
config.hidden_dims = 250
config.epochs = 10

(X_train, y_train), (X_test,
                     y_test) = imdb.load_data(num_words=config.vocab_size)

X_train = sequence.pad_sequences(X_train, maxlen=config.maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=config.maxlen)
print(X_train.shape)
print("After pre-processing", X_train[0])

# Override LSTM & GRU with their CuDNN variants when a GPU is available
if 'GPU' in str(device_lib.list_local_devices()):
    print("Using CUDA for RNN layers")
    LSTM = CuDNNLSTM
    GRU = CuDNNGRU

model = tf.keras.models.Sequential()
model.add(
    tf.keras.layers.Embedding(config.vocab_size,
Example #3
maxlen = 400
embedding_dims = 100
epochs = 10
batch_size = 256
max_features = 5000

MODEL_NAME = 'TextBiRNN-epoch-10-emb-100-avg2-dense2'

use_early_stop = True
tensorboard_log_dir = 'logs\\{}'.format(MODEL_NAME)
# checkpoint_path = "save_model_dir\\{}\\cp-{epoch:04d}.ckpt".format(MODEL_NAME, '')
checkpoint_path = 'save_model_dir\\' + MODEL_NAME + '\\cp-{epoch:04d}.ckpt'
#  ====================================================================

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

print('Pad sequences (samples x time)...')
x_train = pad_sequences(x_train, maxlen=maxlen, padding='post')
x_test = pad_sequences(x_test, maxlen=maxlen, padding='post')
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

model_hepler = ModelHepler(class_num=class_num,
                           maxlen=maxlen,
                           max_features=max_features,
                           embedding_dims=embedding_dims,
                           epochs=epochs,
                           batch_size=batch_size)
model_hepler.get_callback(use_early_stop=use_early_stop,
                          tensorboard_log_dir=tensorboard_log_dir,
Example #4
from tensorflow.keras import models
from tensorflow.keras import layers
import matplotlib.pyplot as plt


# Random seed
np.random.seed(0)
number_of_features = 10000

np_load_old = np.load

# modify the default parameters of np.load
np.load = lambda *a,**k: np_load_old(*a, allow_pickle=True, **k)

# call load_data with allow_pickle implicitly set to true
(data_train, target_train), (data_test, target_test) = imdb.load_data(num_words=number_of_features)
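# Restore the original np.load once the data is loaded (the usual counterpart of the
# allow_pickle workaround above; added here for completeness).
np.load = np_load_old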


# Convert the movie-review data into a one-hot-encoded feature matrix
tokenizer = Tokenizer(num_words=number_of_features)
features_train = tokenizer.sequences_to_matrix(data_train, mode="binary")
features_test = tokenizer.sequences_to_matrix(data_test, mode="binary")


# Build the neural network model.
network = models.Sequential()

# Add a dropout layer as the input layer.
network.add(layers.Dropout(0.3, input_shape=(number_of_features,)))

# Add a fully connected layer with a ReLU activation function.
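# (Illustrative completion — the excerpt cuts off at the comment above.)
network.add(layers.Dense(16, activation="relu"))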
Example #5
To make this tutorial easy to follow, we simply treat the IMDB dataset as a regression
dataset: its prediction targets, which are 0s and 1s, are treated as numerical values so
that they can be used directly as regression targets.

## A Simple Example
The first step is to prepare your data. Here we use the [IMDB
dataset](https://keras.io/datasets/#imdb-movie-reviews-sentiment-classification) as
an example.
"""

import numpy as np
from tensorflow.keras.datasets import imdb

# Load the integer sequences of the IMDB dataset with Keras.
index_offset = 3  # word index offset
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=1000,
                                                      index_from=index_offset)
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)
# Prepare the dictionary of index to word.
word_to_id = imdb.get_word_index()
word_to_id = {k: (v + index_offset) for k, v in word_to_id.items()}
word_to_id["<PAD>"] = 0
word_to_id["<START>"] = 1
word_to_id["<UNK>"] = 2
id_to_word = {value: key for key, value in word_to_id.items()}
# Convert the word indices to words.
x_train = list(map(lambda sentence: ' '.join(
    id_to_word[i] for i in sentence), x_train))
x_test = list(map(lambda sentence: ' '.join(
    id_to_word[i] for i in sentence), x_test))
x_train = np.array(x_train, dtype=np.str)  # note: np.str was removed in newer NumPy; dtype=str works there
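# A minimal next step, assuming the AutoKeras TextRegressor that this tutorial appears to
# build toward (an illustrative sketch, not part of the excerpt):
import autokeras as ak

x_test = np.array(x_test, dtype=str)  # mirror the conversion applied to x_train
reg = ak.TextRegressor(overwrite=True, max_trials=1)  # search over text-regression models
reg.fit(x_train, y_train, epochs=2)
predicted_y = reg.predict(x_test)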
Example #6
args = parser.parse_args()

# Set seeds for reproducibility
tf.random.set_seed(500)

# Set global constants
vocabulary_size = 10000  # choose 10k most-used words for truncated vocabulary
sequence_length = 500  # choose 500-word sequences, either pad or truncate sequences to this
embedding_dims = 50  # number of dimensions to represent each word in vector space
batch_size = 100  # feed in the neural network in 100-example training batches
num_epochs = 10  # number of times the neural network goes over EACH training example
config = int(args.config)  # model configuration

# Load the IMDB dataset for sentiment classification
(X_train, Y_train), (X_test,
                     Y_test) = imdb.load_data(num_words=vocabulary_size)

# Pad & truncate sequences to fixed sequence length
X_train = pad_sequences(sequences=X_train, maxlen=sequence_length)
X_test = pad_sequences(sequences=X_test, maxlen=sequence_length)

# Create word-level binary sentiment classification model
# Input Layer
X = Input(shape=(sequence_length, ), batch_size=batch_size)

# Word-Embedding Layer
embedded = Embedding(input_dim=vocabulary_size, output_dim=embedding_dims)(X)

# Optional Self-Attention Mechanisms
if config == 1:
    embedded, attention_weights = SelfAttention(
Example #7
from tensorflow import keras
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

(train_input, train_target), (test_input,
                              test_target) = imdb.load_data(num_words=500)
train_input, val_input, train_target, val_target = train_test_split(
    train_input, train_target, test_size=0.2, random_state=2021)

train_seq = pad_sequences(train_input, maxlen=100)
val_seq = pad_sequences(val_input, maxlen=100)

model = keras.Sequential()
model.add(keras.layers.Embedding(500, 16, input_length=100))
model.add(keras.layers.LSTM(8, dropout=0.3))
model.add(keras.layers.Dense(1, activation="sigmoid"))

rmsprop = keras.optimizers.RMSprop(learning_rate=1e-4)
model.compile(optimizer=rmsprop,
              loss="binary_crossentropy",
              metrics=["accuracy"])
checkpoint = keras.callbacks.ModelCheckpoint("best_lstm.h5")
stoppoint = keras.callbacks.EarlyStopping(patience=3,
                                          restore_best_weights=True)
history = model.fit(train_seq,
                    train_target,
                    epochs=100,
                    batch_size=64,
                    validation_data=(val_seq, val_target),
Example #8
    parser.add_argument("--n_cells", required=False, default=32)
    parser.add_argument("--batch_size", required=False, default=32)
    parser.add_argument("--learning_rate", required=False, default=0.01)

    args = parser.parse_args()

    print("A kind of train word: ", args.num_words)
    print("A kind of non train word: ", args.skip_top)
    print("The number of cells: ", args.n_cells)
    print("Batch size: ", args.batch_size)
    print("Learning rate: ", args.learning_rate)

    # Load tensorflow keras imdb dataset.
    print("Loading...")
    (x_train_all, y_train_all), (x_test, y_test) = \
        imdb.load_data(skip_top=args.skip_top, num_words=args.num_words)

    # Remove padding (0), start-of-sequence (1), and unknown-word (2) tokens.
    for i, sample in enumerate(x_train_all):
        x_train_all[i] = [n for n in sample if n > 2]

    # Load a dictionary.
    word2index = imdb.get_word_index()
    index2word = {word2index[k]: k for k in word2index}

    # Shuffle the dataset.
    np.random.seed(42)
    random_index = np.random.permutation(25000)
    x_train = x_train_all[random_index[:20000]]
    y_train = y_train_all[random_index[:20000]]
    x_val = x_train_all[random_index[20000:]]
Example #9
                         for key, word in vocabulary_inv.items()}
    return embedding_weights




# Preprocessing parameters
sequence_length = 400
max_words = 5000

# Word2Vec parameters
min_word_count = 1
context = 10

imdb = tf.keras.datasets.imdb
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=5000, start_char=None,
                                                       oov_char=None, index_from=None)
x_train = sequence.pad_sequences(x_train, maxlen=sequence_length, padding="post", truncating="post")
x_test = sequence.pad_sequences(x_test, maxlen=sequence_length, padding="post", truncating="post")

vocabulary = imdb.get_word_index()
vocabulary_inv = dict((v, k) for k, v in vocabulary.items())
vocabulary_inv[0] = "<PAD/>"

embedding_weights = train_word2vec(np.vstack((x_train, x_test)), vocabulary_inv, num_features=embedding_dim,
                                       min_word_count=min_word_count, context=context)


#teacher-predictions
#teacher = 'lstm'
#def get_teacher_predictions(teacher):
#  if teacher == 'lstm':
Example #10
            return lemmatizer.lemmatize(token, 'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token, 'v')
        else:
            return lemmatizer.lemmatize(token, 'n')

    pre_proc_text = " ".join(
        [prat_lemmatize(token, tag) for token, tag in tagged_corpus])

    return pre_proc_text


# The IMDB data contains 88,584 distinct words in total (the vocabulary size).
# Keep only the 6,000 most frequent words and mark the rest as OOV.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=6000,
                                                      start_char=0,
                                                      oov_char=0,
                                                      index_from=0)

# Combine the train and test data; split them again later if needed.
text = np.hstack([x_train, x_test])
label = np.hstack([y_train, y_test])

# Fetch the vocabulary.
word2idx = imdb.get_word_index()
idx2word = dict((v, k) for k, v in word2idx.items())

# Mark start_char and oov_char as '.'; they are removed later during preprocessing.
idx2word[0] = '.'


# Convert the integer-encoded x_train back into actual words.
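# (Illustrative sketch of the conversion described above — the excerpt ends at the comment.)
sentences = [' '.join(idx2word.get(i, '.') for i in review) for review in x_train]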
Example #11
    def run_imdb():
        # Extract useful data from dataset
        print('Extracting the IMDB dataset')
        (train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)

        # Illustration of the input data
        print(
            f'In this dataset a label of 1 indicates a positive review, 0 a negative review.\nHaving taken the top 10,000'
            f' most-used words no word index will exceed 10,000.\nMax Index = '
            f'{max([max(sequence) for sequence in train_data])}')

        print(
            f"For the sake of illustration, let's decode a review back to English (not being printed for easier reading)")
        word_index = imdb.get_word_index()
        reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
        decoded_review = ' '.join([reverse_word_index.get(i - 3, '?') for i in train_data[0]])
        # print(decoded_review)

        # Encoding the inputs
        print("In order to pass these lists of integers into a neural network we must first encode them as tensors of "
              "uniform length.\nIn this example we'll use one-hot encoding, done manually for the sake of understanding.")

        def vectorise_sequences(sequences, dimension=10000):
            ret = np.zeros((len(sequences), dimension))
            for i, sequence in enumerate(sequences):
                ret[i, sequence] = 1
                if i < 1:
                    print(f"\n{sequence} => {ret[i]}\n")
            return ret

        x_train = vectorise_sequences(train_data)
        y_train = np.asarray(train_labels).astype('float32')
        x_test = vectorise_sequences(test_data)
        y_test = np.asarray(test_labels).astype('float32')

        # Design and compile the model
        print("Now to build the network, this time using parameters with greater configurability")
        model = models.Sequential()
        model.add(layers.Dense(16, activation='relu', input_shape=(10000,)))
        model.add(layers.Dense(16, activation='relu'))
        model.add(layers.Dense(1, activation='sigmoid'))

        model.compile(optimizer=optimizers.RMSprop(lr=0.001), loss='binary_crossentropy',
                      metrics=[metrics.binary_accuracy])

        # Divide the training data
        print("Creating a validation set for greater insight during training")
        x_val = x_train[:10000]  # Taking the 1st 10000 samples for validation
        partial_x_train = x_train[10000:]  # Leaving everything from 10000 onwards for training
        y_val = y_train[:10000]  # Taking the 1st 10000 labels for validation
        partial_y_train = y_train[10000:]  # Leaving everything from 10000 onwards for training

        # Train the model
        print("Begin training the model:")
        history = model.fit(partial_x_train, partial_y_train, epochs=20, batch_size=512, validation_data=(x_val, y_val))
        history_dict = history.history

        print(f"\nNote that the history returned by the fit function has a 'history' member which is a dictionary. "
              f"The keys are: {history_dict.keys()}")  # ['loss', 'binary_accuracy', 'val_loss', 'val_binary_accuracy']

        # Prepare to plot the training and validation information
        loss_values = history_dict['loss']
        val_loss_values = history_dict['val_loss']
        acc_values = history_dict['binary_accuracy']
        val_acc_values = history_dict['val_binary_accuracy']

        epochs = range(1, len(history_dict['binary_accuracy']) + 1)
        plt.plot(epochs, loss_values, 'bo', label='Training Loss')
        plt.plot(epochs, val_loss_values, 'b', label='Validation Loss')
        plt.title('Training and Validation Loss')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.legend()
        plt.show()

        plt.clf()
        plt.plot(epochs, acc_values, 'bo', label='Training Accuracy')
        plt.plot(epochs, val_acc_values, 'b', label='Validation Accuracy')
        plt.title('Training and Validation Accuracy')
        plt.xlabel('Epochs')
        plt.ylabel('Accuracy')
        plt.legend()
        plt.show()

        # Evaluate the model
        print("\nAfter reviewing each plot, evaluate the performance of the model on new data")
        results = model.evaluate(x_test, y_test)
        print(f"Evaluation Results: Loss = {results[0]}    Accuracy = {results[1] * 100}%")
Example #12
File: data.py  Project: jkjan/NLP
import tensorflow as tf
import numpy as np
from tensorflow.keras.datasets import imdb
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
vocab_size = 10000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=vocab_size)
print('Maximum review length: {}'.format(max(len(l) for l in X_train)))
print('Average review length: {}'.format(sum(map(len, X_train)) / len(X_train)))
max_len = 500
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)
print(np.shape(X_train), np.shape(y_train))
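# Illustrative sketch (not in the original file): a minimal classifier consuming the padded
# sequences and the categorical labels prepared above.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 32, input_length=max_len),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(2, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])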
Example #13
                    default=1,
                    metavar='S',
                    help='random seed (default: 1)')
parser.add_argument('--vocab-size',
                    type=int,
                    default=2000,
                    help='Max size of the vocabulary (default: 2000)')
parser.add_argument('--max-len',
                    type=int,
                    default=250,
                    help='Sequence max length (default: 250)')
args = parser.parse_args()

print('Loading data...')
(x_train, y_train), (x_test,
                     y_test) = imdb.load_data(num_words=args.vocab_size,
                                              maxlen=args.max_len)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=args.max_len)
x_test = sequence.pad_sequences(x_test, maxlen=args.max_len)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Load ONNX model...')
onnx_model = onnx.load(args.model_path)

#onnx.checker.check_model(onnx_model)

print('Convert ONNX to Keras...')
Example #14
                print(e)
    except:
        print(
            'Static mode selected but no memory limit set. Please set a memory limit by adding the flag -gm=X (gb) or --gpumemory=x (gb) after -m=s or --memory=s'
        )
        quit()
else:
    physical_devices = tf.config.experimental.list_physical_devices('GPU')
    for physical_device in physical_devices:
        tf.config.experimental.set_memory_growth(physical_device, True)

#%%
max_features = 2000
max_len = 500

(X_train, Y_train), (X_test, Y_test) = imdb.load_data(num_words=max_features)

X_train = sequence.pad_sequences(X_train, maxlen=max_len)
X_test = sequence.pad_sequences(X_test, maxlen=max_len)

model = tf.keras.models.Sequential()
model.add(
    layers.Embedding(max_features, 128, input_length=max_len, name='embed'))
model.add(layers.Conv1D(32, 7, activation='relu'))
model.add(layers.MaxPooling1D(5))
model.add(layers.Conv1D(32, 7, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(1))
model.summary()
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
Example #15
    from tensorflow.keras.models import Model
    from tensorflow.keras.layers import Input, Dense, Dropout, GlobalAveragePooling1D
    from tensorflow.keras.optimizers import Adam
    from tensorflow.keras.callbacks import EarlyStopping
    from tensorflow.keras.datasets import imdb
    from tensorflow.keras.preprocessing import sequence
    from tensorflow.keras.utils import to_categorical

    vocab_size = 5000
    max_len = 256
    model_dim = 512  # word-vector dimensionality after the Embedding layer
    batch_size = 128
    epochs = 10

    print("Data downloading and pre-processing ... ")
    (x_train, y_train), (x_test, y_test) = imdb.load_data(maxlen=max_len,
                                                          num_words=vocab_size)

    x_train = sequence.pad_sequences(x_train, maxlen=max_len, padding='post')
    x_test = sequence.pad_sequences(x_test, maxlen=max_len, padding='post')
    x_train_masks = tf.equal(x_train, 0)
    x_test_masks = tf.equal(x_test, 0)
    y_train = to_categorical(y_train)
    y_test = to_categorical(y_test)

    print('Model building ... ')
    inputs = Input(shape=(max_len, ), name="inputs")
    masks = Input(shape=(max_len, ), name='masks')
    embeddings = Embedding(vocab_size,
                           model_dim,
                           embeddings_initializer='uniform',
                           input_length=max_len,
Example #16
# Import TensorFlow and TensorFlow Eager
import tensorflow as tf
import tensorflow.contrib.eager as tfe
import numpy as np
import matplotlib.pyplot as plt
import os

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.datasets import imdb

tfe.enable_eager_execution()

# 25,000 training samples and 25,000 test samples
vocab_size = 10000
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=vocab_size)

# Dict mapping each word to its integer index
word_index = imdb.get_word_index()
# Dict mapping each index to its word. Note that 0 is a special key representing an unknown word
index_word = dict([(value, key) for (key, value) in word_index.items()])
# print("LEN:", len(set(index_word.keys())))
# print(index_word[int(2)])

# We decode the review; note that our indices were offset by 3
# because 0, 1 and 2 are reserved indices for "padding", "start of sequence", and "unknown".
decoded_review = ' '.join([index_word.get(i - 3, '?') for i in train_data[10]])
# print(train_data[10])
# print(decoded_review)

# Truncate or pad every sentence to a length of 80 words
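# (Illustrative sketch of the step described above — the excerpt ends at the comment.)
train_data = sequence.pad_sequences(train_data, maxlen=80)
test_data = sequence.pad_sequences(test_data, maxlen=80)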
Example #17
from tensorflow.keras.datasets import imdb
import numpy as np
import matplotlib.pyplot as plt
from tuto_utils import util_func


# decoder
def decode_review(index_review):
    word_index = imdb.get_word_index()
    reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
    # i - 3 because 0, 1, 2 are reserved indices for "padding", "start of sequence" and "unknown"
    return ' '.join([reverse_word_index.get(i - 3, '?') for i in index_review])


# IMDB dataset: label 0 is negative, 1 is positive
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)


# NumPy allows fancy-index assignment of the form results[i, sequence] = 1, which is handy here
def vectorize_sequences(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1
    return results


x_train, x_test = vectorize_sequences(train_data), vectorize_sequences(test_data)
y_train, y_test = np.asarray(train_labels).astype('float32'), np.asarray(test_labels).astype('float32')
epochs = 5

model = models.Sequential()
Example #18
            else:
                result.append(0)
        return result

    # Model evaluation
    def accuracy(self, y, y_pred):
        right = 0
        for i in range(len(y)):
            if y[i] == y_pred[i]:
                right += 1
        return right / len(y)


if __name__ == "__main__":
    # Use the IMDB movie-review data for binary classification
    (train_data, train_labels), (test_data, test_labels) = imdb.load_data(
        num_words=1000)  # keep only the 1,000 most frequent words in the training data

    # Turn the lists into equal-length one-hot encoded vectors
    def vectorize_sequences(sequences, dimension=1000):
        results = np.zeros((len(sequences), dimension))
        for i, sequence in enumerate(sequences):
            results[i, sequence] = 1.
        return results

    x_train = vectorize_sequences(train_data)
    x_test = vectorize_sequences(test_data)

    # Instantiate the LR class
    LR = LogisticRegressionClassifier()
    # Train the model
    LR.fit(x_train, train_labels)
Example #19
# train it! Just because a model performs well on its training data doesn't mean that it will perform well on data it has never seen, and
# what you actually care about is your model's performance on new data (since you already know the labels of your training data -- obviously
# you don't need your model to predict those). For instance, it is possible that your model could end up merely _memorizing_ a mapping between
# your training samples and their targets -- which would be completely useless for the task of predicting targets for data never seen before.
# We will go over this point in much more detail in the next chapter.
#
# Just like the MNIST dataset, the IMDB dataset comes packaged with Keras. It has already been preprocessed: the reviews (sequences of words)
# have been turned into sequences of integers, where each integer stands for a specific word in a dictionary.
#
# The following code will load the dataset (when you run it for the first time, about 80MB of data will be downloaded to your machine):

# %%
from tensorflow.keras.datasets import imdb

(train_data, train_labels), (test_data,
                             test_labels) = imdb.load_data(num_words=10000)

# %% [markdown]
#
# The argument `num_words=10000` means that we will only keep the top 10,000 most frequently occurring words in the training data. Rare words
# will be discarded. This allows us to work with vector data of manageable size.
#
# The variables `train_data` and `test_data` are lists of reviews, each review being a list of word indices (encoding a sequence of words).
# `train_labels` and `test_labels` are lists of 0s and 1s, where 0 stands for "negative" and 1 stands for "positive":

# %%
train_data[0]

# %%
train_labels[0]
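# %% [markdown]
#
# Since only the top 10,000 most frequent words were kept, no word index should exceed
# 9,999 — a quick sanity check (a small added sketch, not part of the excerpt):

# %%
max([max(sequence) for sequence in train_data])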
Example #20
import numpy

import tensorflow as tf

from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence

# fix random seed for reproducibility
numpy.random.seed(7)
# load the dataset but only keep the top n words, zero the rest
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(path="imdb.npz",
                                                      num_words=top_words)
# Truncates and pads input sequences.
max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)


class SequenceClassifier(tf.keras.Model):
    def __init__(self, num_words, vector_size, batch_size):
        super(SequenceClassifier, self).__init__()

        self._embedding = Embedding(num_words,
                                    vector_size,
                                    batch_input_shape=(batch_size, None))
        self._lstm = tf.keras.layers.LSTM(10, return_state=True)
        self._dense = Dense(1, activation='sigmoid')
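    # Illustrative forward pass (not in the excerpt): embed the tokens, run the LSTM,
    # and classify from its final output.
    def call(self, inputs):
        x = self._embedding(inputs)
        output, state_h, state_c = self._lstm(x)
        return self._dense(output)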
Example #21
import tensorflow as tf
import numpy as np
from tensorflow.keras.datasets import imdb

number_of_words = 20000
max_len = 100

# load dataset
# https://stackoverflow.com/questions/55890813/how-to-fix-object-arrays-cannot-be-loaded-when-allow-pickle-false-for-imdb-loa/56062555
# np_load_old = np.load
# np.load = lambda *a,**k: np_load_old(*a, allow_pickle=True, **k)
(X_train, y_train), (X_test,
                     y_test) = imdb.load_data(num_words=number_of_words)
# np.load = np_load_old

# padding

X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train,
                                                        maxlen=max_len)
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, maxlen=max_len)

#  RNN
model = tf.keras.Sequential()

model.add(
    tf.keras.layers.Embedding(input_dim=number_of_words,
                              output_dim=128,
                              input_shape=(X_train.shape[1], )))

# tf.keras.layers.CuDNNLSTM
model.add(tf.keras.layers.LSTM(units=128, activation='tanh'))
Example #22
word2id = imdb.get_word_index()   # dictionary from words to integers (the id of the word in the vocab)
id2word = {i: word for word, i in word2id.items()}
# Embedding matrix holds the vector representation for the words.
embedding_matrix = np.zeros((vocabulary_size, 100)) # 50000
for word, index in word2id.items():
    if index > vocabulary_size - 1:
        continue
    else:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector

# for i in range(0, 3):
#    print("The glove embedding for '{}' is {} ".format(list(word2id.keys())[i], embedding_matrix[i]))

(X_train_full, y_train_full), (X_test, y_test) = imdb.load_data(num_words = vocabulary_size)

train_size = len(X_train)
X_train_full = pad_sequences(X_train_full, maxlen=1000)
X_train, X_valid = X_train_full[5000:], X_train_full[:5000]
y_train, y_valid = y_train_full[5000:], y_train_full[:5000]

X_test = pad_sequences(X_test, maxlen=1000)

# activation_choice = 'tanh'
# drop_out_choice = 0.0
# optimizer_choice = 'adam'
# train_batch_size_choice = 32
# epochs_choice = 5

def part3(num_epochs, train_batch_size_choice, activation_choice, optimizer_choice, drop_out_choice):
Example #23
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout, Conv1D, MaxPooling1D

np.random.seed(0)
tf.random.set_seed(0)

# data load
(X_tn0, y_tn0), (X_te0, y_test) = imdb.load_data(num_words = 2000)   # use only the most frequently occurring words
print(X_tn0.shape)     #(25000,)
print(y_tn0.shape)     #(25000,)
print(X_te0.shape)     #(25000,)
print(y_test.shape)    #(25000,)

X_train = X_tn0[:20000]
y_train = y_tn0[:20000]
X_valid = X_tn0[20000:25000]
y_valid = y_tn0[20000:25000]
print(X_train[0])
print(len(X_train[0]))  #218
print(len(X_train[1]))  #189
print(set(y_test))
print(len(set(y_test))) #2

# pre-processing
Example #24
File: lab6.py  Project: Ksenox/ANN-2021
import matplotlib.pyplot as plt
import numpy as np
from tensorflow.keras import Sequential, regularizers
from tensorflow.keras.datasets import imdb
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

(X_train, y_train), (X_test, y_test) = imdb.load_data()

(training_data,
 training_targets), (testing_data,
                     testing_targets) = imdb.load_data(num_words=500)
data = np.concatenate((training_data, testing_data), axis=0)
targets = np.concatenate((training_targets, testing_targets), axis=0)
index = imdb.get_word_index()
reverse_index = dict([(value, key) for (key, value) in index.items()])
decoded = " ".join([reverse_index.get(i - 3, "#") for i in data[0]])
print(decoded)


def plot_loss(loss, v_loss):
    plt.figure(1, figsize=(8, 5))
    plt.plot(loss, 'b', label='train')
    plt.plot(v_loss, 'r', label='validation')
    plt.title('Loss')
    plt.ylabel('loss')
    plt.xlabel('epochs')
    plt.legend()
    plt.show()
    plt.clf()
Example #25
    coded = [-2]
    coded.extend([index.get(i, 0) for i in txt])
    for i in range(len(coded)):
        if coded[i]:
            coded[i] += 3
        if coded[i] >= idim:
            coded[i] = 2
    print(coded)
    return coded


num = 10000
(training_data,
 training_targets), (testing_data,
                     testing_targets) = imdb.load_data(num_words=num)
data = np.concatenate((training_data, testing_data), axis=0)
targets = np.concatenate((training_targets, testing_targets), axis=0)

data = vectorize(data, num)
targets = np.array(targets).astype("float32")

test_x = data[:10000]
test_y = targets[:10000]
train_x = data[10000:]
train_y = targets[10000:]

model = models.Sequential()

model.add(layers.Dense(50, activation="relu", input_shape=(num, )))
model.add(layers.Dropout(0.5))
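# (Illustrative continuation — the excerpt ends at the dropout layer above.)
model.add(layers.Dense(50, activation="relu"))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(1, activation="sigmoid"))
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])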
Example #26
@author: santc
"""

## Step 2: Import the libraries
import tensorflow as tf
from tensorflow.keras.datasets import imdb

tf.__version__

## Step 3: Pre-processing
### Set the parameters for the dataset
number_of_words = 20000
max_len = 100

### Load the IMDB dataset
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=number_of_words)
X_train.shape
X_train
X_train[0]
# Original dataset with the full review texts: https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

y_train

### Pad the sequences (texts) so they all have the same length
len(X_train[0])
len(X_train[1])
X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, maxlen=max_len)
len(X_train[0])
len(X_train[1])
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, maxlen=max_len)
Example #27
import numpy as np
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, SimpleRNN

seed = 10
np.random.seed(seed)  # set the random seed
# Load the IMDb dataset
top_words = 1000
(X_train, Y_train), (X_test, Y_test) = imdb.load_data(num_words=top_words)
# Data preprocessing
max_words = 100
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)
# Define the model
model = Sequential()
model.add(Embedding(top_words, 32, input_length=max_words))
model.add(Dropout(0.25))
model.add(SimpleRNN(32))
model.add(Dense(256, activation="relu"))
model.add(Dropout(0.25))
model.add(Dense(1, activation="sigmoid"))
model.summary()  # show a summary of the model
# Compile the model
model.compile(loss="binary_crossentropy",
              optimizer="rmsprop",
              metrics=["accuracy"])
# Train the model
history = model.fit(X_train,
                    Y_train,
Example #28
    args = modelParams()
    args.model_name = modelName
    args.save_dir = modelName + "/"

    githubFolder = modelName + "/"
    if not os.path.exists(githubFolder):
        os.makedirs(githubFolder)
    args.githubFolder = githubFolder

    #Read Data
    #X_train, X_validation, X_test, y_train, y_validation, y_test, num_classes = readPartialData(taskName,0.25)

    (X_train, y_train), (X_test, y_test) = imdb.load_data(path="imdb_full.pkl",
                                                          nb_words=None,
                                                          skip_top=0,
                                                          maxlen=None,
                                                          seed=113,
                                                          start_char=1,
                                                          oov_char=2,
                                                          index_from=3)

    X_val = X_test[0:10000]
    X_test = X_test[10000:]
    y_val = y_test[0:10000]
    y_test = y_test[10000:]

    X = np.concatenate((X_test, X_train, X_val))

    # maxes = [np.max(i) for i in X]
    # maxes = np.array(maxes)
    # print(len(maxes))
    # print(np.max(maxes))  #<-- vocab size = 88586+1
Example #29
# from keras import optimizers, losses, metrics, layers
from tensorflow.keras.datasets import imdb

# tf.__version__
# from keras.models import Sequential
# from keras.layers import Dense, Dropout, Activation, Flatten
# from keras.layers import Convolution2D, MaxPooling2D
# from keras.utils import np_utils
# from keras import backend as K

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

num_words = 30000
maxlen = 200

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=num_words)
print(x_train.shape, ' ', y_train.shape)
print(x_test.shape, ' ', y_test.shape)
x_train = keras.preprocessing.sequence.pad_sequences(x_train,
                                                     maxlen,
                                                     padding='post')
x_test = keras.preprocessing.sequence.pad_sequences(x_test,
                                                    maxlen,
                                                    padding='post')
print(x_train.shape, ' ', y_train.shape)
print(x_test.shape, ' ', y_test.shape)


def lstm_model():
    model = keras.Sequential([
        layers.Embedding(input_dim=num_words,
"""Prac2_imdb_classification.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1zs--3ULwCynfqLilHDzTUEoPb51c_qTT
"""

import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import imdb
from tensorflow.keras import models,layers
from tensorflow import keras 
import tensorflow as tf

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000) # Keep only the top 10,000 most frequent words

word_index = imdb.get_word_index() # word_index is a dictionary mapping words to an integer index.
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()]) # Reverses it, mapping integer indices to words

# Decode the review to read its content. Note that the indices are offset by 3,
# because 0, 1, and 2 are reserved indices for "padding", "start of sequence", and "unknown"
decoded_review = ' '.join([reverse_word_index.get(i-3, '?') for i in train_data[100]])
print(decoded_review)

# one-hot encoding
def vectorize_sequences(sequences, dimension=10000):
  results = np.zeros((len(sequences), dimension))
  for i, sequence in enumerate(sequences):
    results[i, sequence] = 1
  return results
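# Illustrative next step (not in the excerpt): vectorize the reviews and cast the labels,
# which is how the function above is meant to be used.
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)
y_train = np.asarray(train_labels).astype('float32')
y_test = np.asarray(test_labels).astype('float32')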