def dump_imdb_to_word2vec_corpus(filename, n_words=100000):
    """Write every review as one line of space-separated word ids."""
    train_data, valid_data, test_data = load_data(n_words=n_words)
    with open(filename, 'w') as fp:
        for data in [train_data, valid_data, test_data]:
            x_data, x_label = data  # fixed: was unpacking train_data on every pass
            for x in x_data:
                fp.write(' '.join(str(token_id) for token_id in x) + '\n')
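# A minimal usage sketch; assumes the Theano-tutorial-style `load_data` used
# above is importable and that 'imdb_w2v_corpus.txt' is a writable path.
if __name__ == '__main__':
    dump_imdb_to_word2vec_corpus('imdb_w2v_corpus.txt', n_words=100000)
    # Each line of the file is now one review as space-separated word ids,
    # ready to feed to a word2vec trainer such as gensim's Word2Vec.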
def make_data_loaders(experiment_config: Dict[str, Any],
                      hparams: Dict[str, Any]) -> Tuple[Sequence, Sequence]:
    (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=hparams["max_features"])
    train = ImdbSequence(x_train, y_train, hparams["batch_size"], hparams["max_text_len"])
    test = ImdbSequence(x_test, y_test, hparams["batch_size"], hparams["max_text_len"])
    return (train, test)
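# Hypothetical call, assuming an hparams dict with the three keys the function
# reads; ImdbSequence is the project's own keras.utils.Sequence subclass, and
# experiment_config is unused here, so an empty dict suffices.
hparams = {"max_features": 20000, "batch_size": 32, "max_text_len": 80}
train_seq, test_seq = make_data_loaders({}, hparams)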
def load_dataset(n_words=10000, maxlen=140):
    train, val, test = imdb.load_data(n_words=n_words, valid_portion=0.05, maxlen=maxlen)
    X_train, y_train = train
    X_val, y_val = val
    X_test, y_test = test
    X_train, X_val, X_test = map(pad_mask, [X_train, X_val, X_test])
    y_train, y_val, y_test = map(np.asarray, [y_train, y_val, y_test])
    return X_train, y_train, X_val, y_val, X_test, y_test
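# Sketch of a call site; `pad_mask` is this project's own padding helper and
# is assumed to return a fixed-length array per split.
X_train, y_train, X_val, y_val, X_test, y_test = load_dataset(n_words=10000, maxlen=140)
print(X_train.shape, y_train.shape)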
def kitti(mc, train_graph, eval_graph):
    with tf.name_scope("KITTI_input"):
        keys_to_features = imdb.get_keys_to_features()
        dataset_train_path = os.path.join(mc.DATA_PATH, "kitti_train.record")
        dataset_eval_path = os.path.join(mc.DATA_PATH, "kitti_val.record")

        # get anchor boxes before creating the input graph
        mc.ANCHOR_BOX, mc.ANCHORS = imdb.get_anchor_box_from_dataset(
            mc, dataset_train_path, keys_to_features)

        # prepare training dataset
        if train_graph:
            with train_graph.as_default():
                dataset_train = tf.contrib.data.make_batched_features_dataset(
                    dataset_train_path, mc.BATCH_SIZE, keys_to_features,
                    num_epochs=None,
                    reader_num_threads=mc.NUM_THREADS // 2,  # fixed: thread counts must be ints
                    parser_num_threads=mc.NUM_THREADS // 2,
                    shuffle_buffer_size=1200 if mc.IS_TRAINING else 512,
                    sloppy_ordering=True)
                it_train = dataset_train.make_one_shot_iterator()
                train_list = imdb.load_data(it_train.get_next(), mc, training=True,
                                            image_decoder=tf.image.decode_png)
        else:
            train_list = None

        # prepare evaluation dataset
        if eval_graph:
            with eval_graph.as_default():
                eval_mc = edict(mc.copy())
                eval_mc.IS_TRAINING = False
                eval_mc.DATA_AUGMENTATION = False
                dataset_eval = tf.contrib.data.make_batched_features_dataset(
                    dataset_eval_path, eval_mc.BATCH_SIZE, keys_to_features,
                    num_epochs=None,
                    reader_num_threads=mc.NUM_THREADS // 2,
                    parser_num_threads=mc.NUM_THREADS // 2,
                    shuffle=False, drop_final_batch=True)
                it_eval = dataset_eval.make_one_shot_iterator()
                eval_list = imdb.load_data(it_eval.get_next(), eval_mc, training=False,
                                           image_decoder=tf.image.decode_png)
        else:
            eval_list = None

        mc.EVAL_TOOL_PATH = os.path.join(os.path.dirname(__file__),
                                         "kitti-eval/cpp/evaluate_object")
    return train_list, eval_list, mc
def load_data(maxlen=3000):
    '''Load dataset'''
    train, valid, test = imdb.load_data()
    tr_inp, _, tr_targ = imdb.prepare_data(train[0], train[1], maxlen=maxlen)
    te_inp, _, te_targ = imdb.prepare_data(test[0], test[1], maxlen=maxlen)
    v_inp, _, v_targ = imdb.prepare_data(valid[0], valid[1], maxlen=maxlen)

    train = shuffle(np.transpose(tr_inp), reformat(np.asarray(tr_targ), 2))
    test = shuffle(np.transpose(te_inp), reformat(np.asarray(te_targ), 2))
    valid = shuffle(np.transpose(v_inp), reformat(np.asarray(v_targ), 2))

    # print statements converted from Python 2 syntax
    print("Train shape : {}, {}".format(train[0].shape, train[1].shape))
    print("Test shape : {}, {}".format(test[0].shape, test[1].shape))
    print("Valid shape : {}, {}".format(valid[0].shape, valid[1].shape))

    imdb_dict = pickle.load(open('imdb.dict.pkl', 'rb'))
    return train, test, valid, imdb_dict
def main(unused_args):
    maxlen = 100
    n_words = 10000

    print('Loading data')
    train, valid, test = imdb.load_data(n_words=n_words, valid_portion=0.05, maxlen=maxlen)
    train = imdb.prepare_data(train[0], train[1], maxlen=maxlen)
    valid = imdb.prepare_data(valid[0], valid[1], maxlen=maxlen)
    test = imdb.prepare_data(test[0], test[1], maxlen=maxlen)
    for data in [train, valid, test]:
        print(data[0].shape, data[1].shape, data[2].shape)

    config = get_config()
    eval_config = get_config()
    #eval_config.batch_size = 1
    #eval_config.num_steps = 1

    with tf.Graph().as_default(), tf.Session() as session:
        initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            m = SentimentModel(is_training=True, config=config)
        with tf.variable_scope("model", reuse=True, initializer=initializer):
            mvalid = SentimentModel(is_training=False, config=config)
            mtest = SentimentModel(is_training=False, config=config)

        tf.initialize_all_variables().run()

        for i in range(config.max_max_epoch):
            lr_decay = config.lr_decay ** max(i - config.max_epoch, 0.0)
            m.assign_lr(session, config.learning_rate * lr_decay)
            print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))

            start_time = time.time()
            train_acc = run_epoch(session, m, train, m.train_op)
            print("Training Accuracy = %.4f, time = %.3f seconds\n" %
                  (train_acc, time.time() - start_time))
            valid_acc = run_epoch(session, mvalid, valid, tf.no_op())
            print("Valid Accuracy = %.4f\n" % valid_acc)
            test_acc = run_epoch(session, mtest, test, tf.no_op())
            print("Test Accuracy = %.4f\n" % test_acc)
def generate_data(self):
    '''Load the dataset and generate the train and valid splits.'''
    print("Loading data...")
    train, valid, _ = load_data(path=self.path)
    self.X_train, self.X_mask_train, self.Y_train = prepare_data(train[0], train[1],
                                                                 maxlen=self.maxlen)
    self.X_valid, self.X_mask_valid, self.Y_valid = prepare_data(valid[0], valid[1],
                                                                 maxlen=self.maxlen)
    del train, valid

    print(len(self.X_train), 'train sequences')
    print(len(self.X_valid), 'valid sequences')

    print("Pad sequences (samples x time)")
    self.X_train = sequence.pad_sequences(self.X_train, maxlen=self.maxlen)
    self.X_valid = sequence.pad_sequences(self.X_valid, maxlen=self.maxlen)
    print('X_train shape:', self.X_train.shape)
    print('X_valid shape:', self.X_valid.shape)
# In[3]:
# Some of the code and explanation here is taken from https://github.com/Hvass-Labs/ :)

# In[4]:
import imdb  # helper package to download and load the IMDB dataset, by https://github.com/Hvass-Labs/

# In[5]:
imdb.maybe_download_and_extract()  # download and extract the dataset

# In[6]:
x_train_text, y_train = imdb.load_data(train=True)   # load the training data
x_test_text, y_test = imdb.load_data(train=False)    # load the test data

# In[7]:
print("Train-set size: ", len(x_train_text))
print("Test-set size: ", len(x_test_text))

# In[8]:
data_text = x_train_text + x_test_text

# In[9]:
x_train_text[100]  # look at an example text
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from scipy.spatial.distance import cdist
from keras.models import Sequential
from keras.layers import Dense, GRU, Embedding
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import imdb

imdb.maybe_download_and_extract()

x_train_text, y_train = imdb.load_data(train=True)
x_test_text, y_test = imdb.load_data(train=False)
data_text = x_train_text + x_test_text  # fixed: was x_test_text + x_test_text

print("Train-set size: ", len(x_train_text))
print("Test-set size: ", len(x_test_text))
print("Train sample ", x_train_text[1])  # fixed: printed a test example under a "Train sample" label
print("Review result ", y_train[1])

num_words = 10000
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(data_text)

if num_words is None:
    # fallback assumed from the Hvass-Labs tutorial: use the full vocabulary
    num_words = len(tokenizer.word_index)
# Convolution
filter_length = 3
nb_filter = 100
# LSTM
lstm_output_size = 64
# vanilla layer
hidden_dims = 64
# Training
batch_size = 32
nb_epoch = 30

print("Loading data...")
train, valid, test = imdb.load_data(nb_words=max_features, valid_portion=0.0)
print(len(train[0]), 'train sequences')
print(len(valid[0]), 'valid sequences')
print(len(test[0]), 'test sequences')

X_train, y_train = train
X_valid, y_valid = valid
X_test, y_test = test

print("Pad sequences (samples x time)")
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_valid = sequence.pad_sequences(X_valid, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('X_train shape:', X_train.shape)
print('X_valid shape:', X_valid.shape)
print('X_test shape:', X_test.shape)
def pascal_voc(mc, train_graph, eval_graph):
    with tf.name_scope("PASCAL_VOC_input") as scope:
        keys_to_features = imdb.get_keys_to_features()

        # set initial record paths to read data from
        dataset_train_path = os.path.join(mc.DATA_PATH, "pascal_voc_train.record")
        dataset_eval_path = os.path.join(mc.DATA_PATH, "pascal_voc_val.record")

        mc.ANCHOR_BOX, mc.ANCHORS = imdb.get_anchor_box_from_dataset(
            mc, dataset_train_path, keys_to_features)

        # create a new dataset with preprocessed/filtered records
        if mc.REDUCE_DATASET:
            if not mc.ALREADY_PREPROCESSED:
                imdb.reduce_dataset_by_class(mc, keys_to_features, dataset_set="train")
                if eval_graph:
                    eval_mc = edict(mc.copy())
                    # eval_mc.BATCH_SIZE = 1
                    eval_mc.IS_TRAINING = False
                    eval_mc.DATA_AUGMENTATION = False
                    mc.EVAL_ITERS = imdb.reduce_dataset_by_class(
                        eval_mc, keys_to_features, dataset_set="val")
                    eval_mc.EVAL_ITERS = mc.EVAL_ITERS
                    print("EVAL ITERS :%d" % (mc.EVAL_ITERS))
            else:
                pass
                # with open(os.path.join(mc.TRAIN_DIR, "BGR_MEANS.txt"), "r") as f:
                #     mc.BGR_MEANS = np.fromstring(f.readline().split("[[[")[1].split("]]]")[0], sep=" ")

            if mc.PREPROCESSED_DATA_DIR:
                dataset_train_path = os.path.join(
                    mc.PREPROCESSED_DATA_DIR, "preprocessed_pascal_voc_train.record")
                dataset_eval_path = os.path.join(
                    mc.PREPROCESSED_DATA_DIR, "preprocessed_pascal_voc_val.record")
            else:
                dataset_train_path = os.path.join(
                    mc.TRAIN_DIR, "preprocessed_pascal_voc_train.record")
                dataset_eval_path = os.path.join(
                    mc.TRAIN_DIR, "preprocessed_pascal_voc_val.record")
        elif eval_graph:
            mc.EVAL_ITERS = imdb.get_num_images(mc, keys_to_features, dataset_set="val")

        # prepare training dataset
        if train_graph:
            with train_graph.as_default():
                dataset_train = tf.contrib.data.make_batched_features_dataset(
                    dataset_train_path, mc.BATCH_SIZE, keys_to_features,
                    num_epochs=None,
                    reader_num_threads=mc.NUM_THREADS // 2,  # fixed: thread counts must be ints
                    parser_num_threads=mc.NUM_THREADS // 2,
                    shuffle_buffer_size=12000, shuffle=True, sloppy_ordering=True)
                it_train = dataset_train.make_one_shot_iterator()
                train_list = imdb.load_data(it_train.get_next(), mc, training=True,
                                            image_decoder=tf.image.decode_jpeg)
        else:
            train_list = None

        # prepare evaluation dataset
        if eval_graph:
            with eval_graph.as_default():
                dataset_eval = tf.contrib.data.make_batched_features_dataset(
                    dataset_eval_path, mc.BATCH_SIZE, keys_to_features,
                    num_epochs=None,
                    reader_num_threads=mc.NUM_THREADS // 2,
                    parser_num_threads=mc.NUM_THREADS // 2,
                    shuffle=False)
                it_eval = dataset_eval.make_one_shot_iterator()
                eval_mc = edict(mc.copy())
                eval_mc.IS_TRAINING = False
                eval_mc.DATA_AUGMENTATION = False
                eval_list = imdb.load_data(it_eval.get_next(), eval_mc, training=False,
                                           image_decoder=tf.image.decode_jpeg)
        else:
            eval_list = None

    return train_list, eval_list, mc
from collections import OrderedDict
import cPickle as pkl
import sys
import time

import numpy as np
import theano
import theano.tensor as T
from theano import config
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

import imdb

# datasets
datasets = {"imdb": (imdb.load_data(path="data/imdb/imdb.pkl"), imdb.prepare_data)}

# random number generator seed for consistency
SEED = 123
np.random.seed(SEED)


def numpy_floatX(data):
    """Convert an array to Theano's floatX dtype."""
    return np.asarray(data, dtype=config.floatX)


def get_dataset(name):
    """Look up a (data, prepare_data) pair by dataset name."""
    return datasets[name][0], datasets[name][1]
import numpy as np
from keras.datasets import imdb
from keras.layers import Dense
from keras import optimizers
from keras import losses
from keras import metrics


def vectorize_sequences(sequences, dimension=10000):
    """One-hot encode integer sequences into a binary matrix."""
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.
    return results


(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)

# Get word_index and decode reviews
word_index = imdb.get_word_index()
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
decoded_review = ' '.join(  # fixed: ''.join would run the words together
    [reverse_word_index.get(i - 3, '?') for i in train_data[0]])

# Prepare data as binary matrices
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)
y_train = np.asarray(train_labels).astype('float32')
y_test = np.asarray(test_labels).astype('float32')
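# The Dense/optimizers/losses/metrics imports above are otherwise unused; a
# minimal sketch of the binary classifier they suggest (layer sizes, epochs,
# and batch size are assumptions, following the common two-hidden-layer setup
# for this dataset).
from keras.models import Sequential

model = Sequential()
model.add(Dense(16, activation='relu', input_shape=(10000,)))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer=optimizers.RMSprop(),
              loss=losses.binary_crossentropy,
              metrics=[metrics.binary_accuracy])
model.fit(x_train, y_train, epochs=4, batch_size=512,
          validation_data=(x_test, y_test))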
import tensorflow as tf
import numpy as np
from scipy.spatial.distance import cdist
# from tf.keras.models import Sequential  # This does not work!
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
import imdb

imdb.maybe_download_and_extract()

input_text_train, target_train = imdb.load_data(train=True)
input_text_test, target_test = imdb.load_data(train=False)
print("Size of the training set: ", len(input_text_train))
print("Size of the testing set: ", len(input_text_test))

text_data = input_text_train + input_text_test

print('Sample example from the training set...')
print(input_text_train[1])
print('Sample example actual sentiment...')
print(target_train[1])

# Include only the most frequent words
num_top_words = 10000
def get_imdb(n_words=100000, valid_portion=0.1, maxlen=None):  # fixed: parameter was misspelled varid_portion
    """Load the Large Movie Review (IMDB) dataset."""
    return imdb.load_data(path="./imdb.pkl", n_words=n_words,
                          valid_portion=valid_portion, maxlen=maxlen)
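# Usage sketch; assumes ./imdb.pkl exists locally (the Theano LSTM tutorial's
# pickled dataset) and that load_data returns (train, valid, test) tuples.
train, valid, test = get_imdb(n_words=10000, valid_portion=0.1, maxlen=100)
print(len(train[0]), 'train sequences;', len(valid[0]), 'valid sequences')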
def coco(mc, train_graph, eval_graph):
    with tf.name_scope("COCO_input"):
        keys_to_features = imdb.get_keys_to_features()
        # return tf.parse_example(batch_records, keys_to_features)
        dataset_train_path = os.path.join(mc.DATA_PATH, "coco_train.record")
        dataset_eval_path = os.path.join(mc.DATA_PATH, "coco_val.record")

        # create a new dataset with preprocessed/filtered records
        if mc.REDUCE_DATASET and not mc.ALREADY_PREPROCESSED:
            imdb.reduce_dataset_by_class(mc, keys_to_features, dataset_set="train")
            if eval_graph:
                eval_mc = edict(mc.copy())
                # eval_mc.BATCH_SIZE = 1
                eval_mc.IS_TRAINING = False
                eval_mc.DATA_AUGMENTATION = False
                mc.EVAL_ITERS = imdb.reduce_dataset_by_class(eval_mc, keys_to_features,
                                                             dataset_set="val")
                eval_mc.EVAL_ITERS = mc.EVAL_ITERS
            dataset_train_path = os.path.join(
                mc["BASE_DIR"], "preprocessed_" + mc.DATASET_NAME.lower() + "_train.record")
            dataset_eval_path = os.path.join(
                mc["BASE_DIR"], "preprocessed_" + mc.DATASET_NAME.lower() + "_val.record")
            print("EVAL ITERS :%d" % (mc.EVAL_ITERS))

        if mc.REDUCE_DATASET and mc.PREPROCESSED_DATA_DIR:
            dataset_train_path = os.path.join(
                mc["PREPROCESSED_DATA_DIR"],
                "preprocessed_" + mc.DATASET_NAME.lower() + "_train.record")
            dataset_eval_path = os.path.join(
                mc["PREPROCESSED_DATA_DIR"],
                "preprocessed_" + mc.DATASET_NAME.lower() + "_val.record")

        # get anchor boxes before creating the input graph
        mc.ANCHOR_BOX, mc.ANCHORS = imdb.get_anchor_box_from_dataset(
            mc, dataset_train_path, keys_to_features)

        # prepare training dataset
        if train_graph:
            with train_graph.as_default():
                dataset_train = tf.contrib.data.make_batched_features_dataset(
                    dataset_train_path, mc.BATCH_SIZE, keys_to_features,
                    num_epochs=mc.TRAIN_EPOCHS,
                    reader_num_threads=8, parser_num_threads=8,
                    shuffle_buffer_size=13000 if mc.IS_TRAINING else 512,
                    sloppy_ordering=True)
                it_train = dataset_train.make_one_shot_iterator()
                train_list = imdb.load_data(it_train.get_next(), mc, training=True,
                                            image_decoder=tf.image.decode_jpeg)
        else:
            train_list = None

        # prepare evaluation dataset
        if eval_graph:
            with eval_graph.as_default():
                eval_mc = edict(mc.copy())
                # eval_mc.BATCH_SIZE = 1
                eval_mc.IS_TRAINING = False
                eval_mc.DATA_AUGMENTATION = False
                dataset_eval = tf.contrib.data.make_batched_features_dataset(
                    dataset_eval_path, eval_mc.BATCH_SIZE, keys_to_features,
                    num_epochs=None, reader_num_threads=8, parser_num_threads=8,
                    shuffle=False, drop_final_batch=True)
                it_eval = dataset_eval.make_one_shot_iterator()
                eval_list = imdb.load_data(it_eval.get_next(), eval_mc, training=False,
                                           image_decoder=tf.image.decode_jpeg)  # fixed: was decode_png; COCO images are JPEG
        else:
            eval_list = None

    return train_list, eval_list, mc
    return zip(range(len(minibatches)), minibatches)


if __name__ == "__main__":
    vocab_size = 10000
    embsize = 128
    hiddensize = 128
    nstep = 100
    mini_batch_size = 16
    max_epochs = 20

    np.random.seed(4488)

    print('Loading data')
    train, valid, test = imdb.load_data(n_words=10000, valid_portion=0.05, maxlen=100)
    train = test  # overrides the training set with the smaller test split
    ydim = np.max(train[1]) + 1

    print('Build model ')
    X, Mask, Y, \
        cost, err, \
        train_function, valid_function, predict_function = build_model(
            vocab_size=vocab_size, embsize=embsize, hiddensize=hiddensize)

    print('Training ')
    for eidx in range(max_epochs):
        kf = get_minibatches_idx(len(train[0]), mini_batch_size, shuffle=True)
        costs = []
        errs = []
def load_my_data():
    train_data, valid_data, test_data = load_data(n_words=n_words)
    train_x, train_y = train_data
    print(train_x[0], train_y[0])  # print converted from Python 2 syntax
    return train_data, valid_data, test_data
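# Usage sketch; `n_words` is read from module scope inside the function, so it
# must be defined before the call (the value here is an assumption).
n_words = 10000
train_data, valid_data, test_data = load_my_data()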
from imdb import load_data, prepare_data
import numpy as np
import pickle as pkl

train, valid, test = load_data(n_words=10, valid_portion=0.05)
x = [train[0][t] for t in range(0, len(train[0]))]
y = [train[1][t] for t in range(0, len(train[1]))]
x, mask, y = prepare_data(x, y)
y = np.array(y)

# one-hot encode each timestep into a vocabulary-sized feature vector
feat_train = np.zeros((x.shape[0], x.shape[1], 10))
for i in range(0, x.shape[0]):
    print("num: " + str(i))  # print converted from Python 2 syntax
    for j in range(0, x.shape[1]):
        feat_train[i][j][x[i][j]] = 1

np.save("data/feats_train.npy", feat_train)
np.save("data/labels_train.npy", y)
np.save("data/mask_train.npy", mask)
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Embedding, Dense, LSTM, Dropout
import imdb

max_features = 20000
max_length = 80
embedding_dim = 256
batch_size = 128
epochs = 10
modes = [0, 1, 2]

print('Loading data...')
# The data file must be supplied locally; see load_data's parameters for details.
get_file_path = r"D:\用户目录\Desktop\郭磊\keras\imdb.npz"  # raw string so the backslashes stay literal
(X_train, y_train), (X_test, y_test) = imdb.load_data(get_file_path, num_words=max_features)
X_train = sequence.pad_sequences(X_train, max_length)
X_test = sequence.pad_sequences(X_test, max_length)
# Label preprocessing is not covered here.

# Compile and train different models while measuring performance.
results = []
for mode in modes:
    print('Testing mode: implementation={}'.format(mode))

    model = Sequential()
    model.add(Embedding(max_features, embedding_dim, input_length=max_length))
    model.add(Dropout(0.2))
    model.add(
        LSTM(embedding_dim,
             dropout=0.2,
#!/usr/bin/env python
# coding: utf-8

# In[2]:
from keras.datasets import imdb

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)
print(max([max(sequence) for sequence in train_data]))

# In[3]:
word_index = imdb.get_word_index()
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
decoded_review = ' '.join([reverse_word_index.get(i - 3, '?') for i in train_data[0]])  # fixed: was '',join
"""
It is being called by main.py.
"""
import os

#import dash
import dash_html_components as html
import dash_core_components as dcc
#from dash.dependencies import Input, Output

import imdb

DATA_DIR = 'data'
PROCESSED_DIR = 'processed'
IMDB_FILE = 'imdb_df.csv'
GENRES_FILE = 'set_genres.pkl'
IMDB_PATH = os.path.join(DATA_DIR, PROCESSED_DIR, IMDB_FILE)
GENRES_PATH = os.path.join(DATA_DIR, PROCESSED_DIR, GENRES_FILE)

df_imdb = imdb.load_data(IMDB_PATH)
genres = imdb.load_genres(GENRES_PATH)

external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

tab2_layout = html.Div([
    html.Div([
        html.Label('Filter by:'),
        dcc.Checklist(id='filter-checklist',
                      options=[{
                          'label': 'Genre',
                          'value': 'Genre'
                      }, {
                          'label': 'Year',
                          'value': 'Year'
                      }],
import preprocess as pre
import imdb
import numpy as np
import matplotlib.pyplot as plt
import gc
import InitializeModel as im
import pickle

print('reading vocabulary list')
word_index = imdb.get_word_index()
# word_index = imdb.get_filtered_word_index()

print('reading IMDB_data')
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(path='imdb',
                                                                      num_words=10000)
data = np.concatenate((train_data, test_data), axis=0)
targets = np.concatenate((train_labels, test_labels), axis=0)

'''
optional: Analyzing Dataset
'''
print("Categories:", np.unique(targets))
print("Number of unique words:", len(np.unique(np.hstack(data))))

# invert word_index to map integer indices back to words
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
decoded_review = ' '.join(
    [reverse_word_index.get(i - 3, '?') for i in train_data[0]])
print('decoded sample: ')
def train_lstm(
    dim_proj=128,  # word embedding dimension and LSTM number of hidden units
    patience=10,  # number of epochs to wait before early stop if no progress
    max_epochs=50,  # the maximum number of epochs to run
    dispFreq=10,  # display the training progress to stdout every N updates
    decay_c=0.,  # weight decay for the classifier applied to the U weights
    lrate=0.0001,  # learning rate for sgd (not used for adadelta and rmsprop)
    n_words=10000,  # vocabulary size
    optimizer=adadelta,  # sgd, adadelta and rmsprop available; sgd is very hard to use and not recommended (probably needs momentum and a decaying learning rate)
    encoder='lstm',  # TODO: can be removed, must be lstm
    saveto='save/lstm_model.npz',  # the best model will be saved there
    validFreq=370,  # compute the validation error after this number of updates
    saveFreq=50,  # save the parameters after every saveFreq updates
    maxlen=100,  # sequences longer than this get ignored
    batch_size=16,  # the batch size during training
    valid_batch_size=64,  # the batch size used for the validation/test set
    dataset='imdb',
    # parameters for extra options
    noise_std=0.,
    use_dropout=True,  # if False, slightly faster but worse test error; this frequently needs a bigger model
    reload_model=None,  # path to a saved model we want to start from
    test_size=-1,  # if > 0, we keep only this number of test examples
):
    # Model options
    model_options = locals().copy()
    print("model options", model_options)

    # load_data, prepare_data = get_dataset(dataset)
    print('Loading data')
    train, valid, test = imdb.load_data(n_words=n_words, valid_portion=0.05,
                                        maxlen=maxlen)
    if test_size > 0:
        # The test set is sorted by size, but we want to keep random-sized
        # examples, so we must take a random selection of them.
        idx = np.arange(len(test[0]))
        np.random.shuffle(idx)
        idx = idx[:test_size]
        test = ([test[0][n] for n in idx], [test[1][n] for n in idx])

    ydim = np.max(train[1]) + 1
    model_options['ydim'] = ydim

    print('Building model')
    # This creates the initial parameters as numpy ndarrays.
    # Dict name (string) -> numpy ndarray
    params = init_params(model_options)

    if reload_model:
        load_params('lstm_model.npz', params)

    # This creates Theano shared variables from the parameters.
    # Dict name (string) -> Theano tensor shared variable
    # params and tparams hold different copies of the weights.
    tparams = init_tparams(params)

    # use_noise is for dropout
    (use_noise, x, mask, y, f_pred_prob, f_pred, cost) = build_model(tparams, model_options)

    if decay_c > 0.:
        decay_c = theano.shared(numpy_floatX(decay_c), name='decay_c')  # fixed: was T.shared
        weight_decay = 0.
        weight_decay += (tparams['U'] ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    f_cost = theano.function([x, mask, y], cost, name='f_cost')  # fixed: was T.function

    grads = tensor.grad(cost, wrt=list(tparams.values()))
    f_grad = theano.function([x, mask, y], grads, name='f_grad')

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = optimizer(lr, tparams, grads, x, mask, y, cost)

    print('Optimization')

    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)

    print("%d train examples" % len(train[0]))
    print("%d valid examples" % len(valid[0]))
    print("%d test examples" % len(test[0]))

    history_errs = []
    best_p = None
    bad_counter = 0  # fixed: was bad_count, but the loop below increments bad_counter

    if validFreq == -1:
        validFreq = len(train[0]) // batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) // batch_size

    uidx = 0  # the number of updates done
    estop = False  # early stop
    start_time = time.time()
    try:
        for eidx in range(max_epochs):
            n_samples = 0

            # Get new shuffled index for the training set.
            kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)

            for _, train_index in kf:
                uidx += 1
                use_noise.set_value(1.)

                # Select the random examples for this minibatch
                y = [train[1][t] for t in train_index]
                x = [train[0][t] for t in train_index]

                # Get the data in numpy.ndarray format.
                # This swaps the axes!
                # Returns a matrix of shape (minibatch maxlen, n samples).
                x, mask, y = imdb.prepare_data(x, y)
                n_samples += x.shape[1]

                cost = f_grad_shared(x, mask, y)
                f_update(lrate)

                if np.isnan(cost) or np.isinf(cost):
                    print('bad cost detected: ', cost)
                    return 1., 1., 1.

                if np.mod(uidx, dispFreq) == 0:
                    print('Epoch ', eidx, 'Update ', uidx, 'Cost ', cost)

                if saveto and np.mod(uidx, saveFreq) == 0:
                    print('Saving...')
                    if best_p is not None:
                        params = best_p
                    else:
                        params = unzip(tparams)
                    np.savez(saveto, history_errs=history_errs, **params)
                    pickle.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1)
                    print('Done')

                if np.mod(uidx, validFreq) == 0:
                    use_noise.set_value(0.)
                    train_err = pred_error(f_pred, imdb.prepare_data, train, kf)
                    valid_err = pred_error(f_pred, imdb.prepare_data, valid, kf_valid)
                    test_err = pred_error(f_pred, imdb.prepare_data, test, kf_test)

                    history_errs.append([valid_err, test_err])

                    if (best_p is None or
                            valid_err <= np.array(history_errs)[:, 0].min()):
                        best_p = unzip(tparams)
                        bad_counter = 0

                    print('Train ', train_err, 'Valid ', valid_err, 'Test ', test_err)

                    if (len(history_errs) > patience and
                            valid_err >= np.array(history_errs)[:-patience, 0].min()):
                        bad_counter += 1
                        if bad_counter > patience:
                            print('Early Stop!')
                            estop = True
                            break

            print('Seen %d samples' % n_samples)

            if estop:
                break

    except KeyboardInterrupt:
        print("Training interrupted")

    end_time = time.time()

    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)

    use_noise.set_value(0.)
    kf_train_sorted = get_minibatches_idx(len(train[0]), batch_size)
    train_err = pred_error(f_pred, imdb.prepare_data, train, kf_train_sorted)
    valid_err = pred_error(f_pred, imdb.prepare_data, valid, kf_valid)
    test_err = pred_error(f_pred, imdb.prepare_data, test, kf_test)

    print('Train ', train_err, 'Valid ', valid_err, 'Test ', test_err)

    if saveto:
        np.savez(saveto, train_err=train_err, valid_err=valid_err,
                 test_err=test_err, history_errs=history_errs, **best_p)

    print('The code run for %d epochs, with %f sec/epochs' % (
        (eidx + 1), (end_time - start_time) / (1. * (eidx + 1))))
    print(('Training took %.1fs' % (end_time - start_time)), file=sys.stderr)
    return train_err, valid_err, test_err
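# Usage sketch in the style of the Theano LSTM tutorial this function follows;
# max_epochs and test_size are reduced here only so a trial run finishes quickly.
if __name__ == '__main__':
    train_err, valid_err, test_err = train_lstm(max_epochs=5, test_size=500)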
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

import imdb
import helper_functions

# Data Import
#------------------------------------------------------------------------------
imdb.maybe_download_and_extract()

x_train_text, y_train = imdb.load_data(train=True)  # I added utf-8 encoding in the code in imdb
x_test_text, y_test = imdb.load_data(train=False)
data_text = x_train_text + x_test_text

# Tokenizer
#------------------------------------------------------------------------------
num_words = 30000
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(data_text)
x_train_tokens = tokenizer.texts_to_sequences(x_train_text)
x_test_tokens = tokenizer.texts_to_sequences(x_test_text)