Example #1
def dump_imdb_to_word2vec_corpus(filename, n_words=100000):
    train_data, valid_data, test_data = load_data(n_words=n_words)
    with open(filename, 'w') as fp:
        for data in [train_data, valid_data, test_data]:
            x_data, x_label = data
            for x in x_data:
                fp.write(' '.join([str(id) for id in list(x)]) + '\n')
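
The function above only dumps each review as a line of space-separated word ids; a possible way to consume that file (a sketch, assuming gensim >= 4 and treating each id as a token) is:

# Hypothetical usage of dump_imdb_to_word2vec_corpus with gensim (not part of the original snippet).
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

dump_imdb_to_word2vec_corpus('imdb_corpus.txt', n_words=100000)
# LineSentence yields each line as a list of "words" (here: word ids as strings).
model = Word2Vec(LineSentence('imdb_corpus.txt'), vector_size=128, window=5, min_count=1)
model.save('imdb_word2vec.model')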
Example #2
def make_data_loaders(experiment_config: Dict[str, Any],
                      hparams: Dict[str, Any]) -> Tuple[Sequence, Sequence]:
    (x_train,
     y_train), (x_test,
                y_test) = imdb.load_data(num_words=hparams["max_features"])
    train = ImdbSequence(x_train, y_train, hparams["batch_size"],
                         hparams["max_text_len"])
    test = ImdbSequence(x_test, y_test, hparams["batch_size"],
                        hparams["max_text_len"])
    return (train, test)
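
ImdbSequence is not defined in this snippet; a minimal sketch of what such a wrapper might look like, assuming it subclasses tf.keras.utils.Sequence and pads every batch to max_text_len, is:

import numpy as np
from tensorflow.keras.utils import Sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences


class ImdbSequence(Sequence):
    """Hypothetical batch generator over the tokenized IMDB reviews."""

    def __init__(self, x, y, batch_size, max_text_len):
        self.x, self.y = x, y
        self.batch_size = batch_size
        self.max_text_len = max_text_len

    def __len__(self):
        # Number of batches per epoch.
        return int(np.ceil(len(self.x) / self.batch_size))

    def __getitem__(self, idx):
        batch = slice(idx * self.batch_size, (idx + 1) * self.batch_size)
        batch_x = pad_sequences(self.x[batch], maxlen=self.max_text_len)
        return batch_x, np.asarray(self.y[batch])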
Example #3
def load_dataset(n_words=10000, maxlen=140):
    train, val, test = imdb.load_data(n_words=n_words, valid_portion=0.05,
                                      maxlen=maxlen)

    X_train, y_train = train
    X_val, y_val = val
    X_test, y_test = test

    X_train, X_val, X_test = map(pad_mask, [X_train, X_val, X_test])
    y_train, y_val, y_test = map(np.asarray, [y_train, y_val, y_test])

    return X_train, y_train, X_val, y_val, X_test, y_test
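
The pad_mask helper used above is not shown; a minimal sketch, under the assumption that it truncates/right-pads each review to a fixed length and returns the padded id matrix together with a 0/1 mask, could be:

import numpy as np

def pad_mask(sequences, maxlen=140):
    """Hypothetical helper: right-pad word-id sequences and build a binary mask."""
    padded = np.zeros((len(sequences), maxlen), dtype='int32')
    mask = np.zeros((len(sequences), maxlen), dtype='float32')
    for i, seq in enumerate(sequences):
        seq = seq[:maxlen]
        padded[i, :len(seq)] = seq
        mask[i, :len(seq)] = 1.0
    return padded, mask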
Example #4
def kitti(mc, train_graph, eval_graph):
  with tf.name_scope("KITTI_input"):
    keys_to_features = imdb.get_keys_to_features()

    dataset_train_path = os.path.join(mc.DATA_PATH, "kitti_train.record")
    dataset_eval_path = os.path.join(mc.DATA_PATH, "kitti_val.record")
    
    # get anchor boxes before creating the input graph
    mc.ANCHOR_BOX, mc.ANCHORS = imdb.get_anchor_box_from_dataset(mc, dataset_train_path, keys_to_features)
    
    # prepare training dataset
    if train_graph:
      with train_graph.as_default():
        dataset_train = tf.contrib.data.make_batched_features_dataset(
            dataset_train_path, mc.BATCH_SIZE, keys_to_features,
            num_epochs=None,
            reader_num_threads=mc.NUM_THREADS / 2,
            parser_num_threads=mc.NUM_THREADS / 2,
            shuffle_buffer_size=1200 if mc.IS_TRAINING else 512,
            sloppy_ordering=True)
        it_train = dataset_train.make_one_shot_iterator()
        train_list = imdb.load_data(it_train.get_next(), mc, training=True, image_decoder=tf.image.decode_png)
    else:
      train_list = None
    
    # prepare evaluation dataset
    if eval_graph:
      with eval_graph.as_default():
        eval_mc = edict(mc.copy())
        eval_mc.IS_TRAINING = False
        eval_mc.DATA_AUGMENTATION = False
        dataset_eval = tf.contrib.data.make_batched_features_dataset(
            dataset_eval_path, eval_mc.BATCH_SIZE, keys_to_features,
            num_epochs=None,
            reader_num_threads=mc.NUM_THREADS / 2,
            parser_num_threads=mc.NUM_THREADS / 2,
            shuffle=False,
            drop_final_batch=True)
        it_eval = dataset_eval.make_one_shot_iterator()
        eval_list = imdb.load_data(it_eval.get_next(), eval_mc, training=False, image_decoder=tf.image.decode_png)
    else:
      eval_list = None
  
  mc.EVAL_TOOL_PATH = os.path.join(os.path.dirname(__file__), "kitti-eval/cpp/evaluate_object")
  
  return train_list, eval_list, mc
def load_data( maxlen=3000 ):
    ''' Load dataset '''
    train, valid, test = imdb.load_data()
    tr_inp, _, tr_targ = imdb.prepare_data( train[0], train[1], maxlen=maxlen )
    te_inp, _, te_targ = imdb.prepare_data( test[0], test[1], maxlen=maxlen )
    v_inp, _, v_targ = imdb.prepare_data( valid[0], valid[1], maxlen=maxlen )
    train = shuffle( np.transpose( tr_inp ), reformat( np.asarray( tr_targ ), 2 ) )
    test = shuffle( np.transpose( te_inp ), reformat( np.asarray( te_targ ), 2 ) )
    valid = shuffle( np.transpose( v_inp ), reformat( np.asarray( v_targ ), 2 ) )
    print "Train shape : {}, {}".format( train[0].shape, train[1].shape )
    print "Test shape : {}, {}".format( test[0].shape, test[1].shape )
    print "Valid shape : {}, {}".format( valid[0].shape, valid[1].shape )
    imdb_dict = pickle.load( open('imdb.dict.pkl','rb') )
    return train, test, valid, imdb_dict
Example #6
def main(unused_args):

    maxlen = 100
    n_words = 10000

    print('Loading data')
    train, valid, test = imdb.load_data(n_words=n_words,
                                        valid_portion=0.05,
                                        maxlen=maxlen)

    train = imdb.prepare_data(train[0], train[1], maxlen=maxlen)
    valid = imdb.prepare_data(valid[0], valid[1], maxlen=maxlen)
    test = imdb.prepare_data(test[0], test[1], maxlen=maxlen)

    for data in [train, valid, test]:
        print(data[0].shape, data[1].shape, data[2].shape)

    config = get_config()
    eval_config = get_config()
    #eval_config.batch_size = 1
    #eval_config.num_steps = 1

    with tf.Graph().as_default(), tf.Session() as session:
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            m = SentimentModel(is_training=True, config=config)
        with tf.variable_scope("model", reuse=True, initializer=initializer):
            mvalid = SentimentModel(is_training=False, config=config)
            mtest = SentimentModel(is_training=False, config=config)

        tf.initialize_all_variables().run()

        for i in range(config.max_max_epoch):
            lr_decay = config.lr_decay**max(i - config.max_epoch, 0.0)
            m.assign_lr(session, config.learning_rate * lr_decay)

            print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
            start_time = time.time()
            train_acc = run_epoch(session, m, train, m.train_op)
            print("Training Accuracy = %.4f, time = %.3f seconds\n" %
                  (train_acc, time.time() - start_time))
            valid_acc = run_epoch(session, mvalid, valid, tf.no_op())
            print("Valid Accuracy = %.4f\n" % valid_acc)

        test_acc = run_epoch(session, mtest, test, tf.no_op())
        print("Test Accuracy = %.4f\n" % test_acc)
    def generate_data(self):
        '''Load the dataset

        Generate train, valid and test dataset

        '''
        print("Loading data...")
        train, valid, _ = load_data(path=self.path)
        self.X_train, self.X_mask_train, self.Y_train = prepare_data(train[0], train[1], maxlen=self.maxlen)
        self.X_valid, self.X_mask_valid, self.Y_valid = prepare_data(valid[0], valid[1], maxlen=self.maxlen)
        del train, valid
        print(len(self.X_train), 'train sequences')
        print(len(self.X_valid), 'valid sequences')
        print("Pad sequences (samples x time)")
        self.X_train = sequence.pad_sequences(self.X_train, maxlen=self.maxlen)
        self.X_valid = sequence.pad_sequences(self.X_valid, maxlen=self.maxlen)
        print('X_train shape:', self.X_train.shape)
        print('X_valid shape:', self.X_valid.shape)
Example #8
def main(unused_args):
    
    maxlen = 100
    n_words = 10000

    print('Loading data')
    train, valid, test = imdb.load_data(n_words=n_words, valid_portion=0.05, maxlen=maxlen)

    train = imdb.prepare_data(train[0], train[1], maxlen=maxlen)
    valid = imdb.prepare_data(valid[0], valid[1], maxlen=maxlen)
    test = imdb.prepare_data(test[0], test[1], maxlen=maxlen)

    for data in [train, valid, test]:
        print(data[0].shape, data[1].shape, data[2].shape)

    config = get_config()
    eval_config = get_config()
    eval_config.batch_size = 1
    eval_config.num_steps = 1

    with tf.Graph().as_default(), tf.Session() as session:
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            m = SentimentModel(is_training=True, config=config)
        with tf.variable_scope("model", reuse=True, initializer=initializer):
            mvalid = SentimentModel(is_training=False, config=config)
            mtest = SentimentModel(is_training=False, config=config)

        tf.initialize_all_variables().run()
        
        for i in range(config.max_max_epoch):
            lr_decay = config.lr_decay ** max(i - config.max_epoch, 0.0)
            m.assign_lr(session, config.learning_rate * lr_decay)

            print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
            start_time = time.time()
            train_acc = run_epoch(session, m, train, m.train_op) 
            print("Training Accuracy = %.4f, time = %.3f seconds\n"%(train_acc, time.time()-start_time))
            valid_acc = run_epoch(session, mvalid, valid, tf.no_op())
            print("Valid Accuracy = %.4f\n" % valid_acc)

        test_acc = run_epoch(session, mtest, test, tf.no_op())
        print("Test Accuracy = %.4f\n" % test_acc)
# In[3]:

# Some of the code and explanation here is taken from https://github.com/Hvass-Labs/ :)

# In[4]:

import imdb  # this is helper package to download and load the imdb dataset by https://github.com/Hvass-Labs/

# In[5]:

imdb.maybe_download_and_extract()  #Downloading and Extracting the dataset

# In[6]:

x_train_text, y_train = imdb.load_data(train=True)  #loading train data
x_test_text, y_test = imdb.load_data(train=False)  # loading test data

# In[7]:

print("Train-set size: ", len(x_train_text))
print("Test-set size:  ", len(x_test_text))

# In[8]:

data_text = x_train_text + x_test_text

# In[9]:

x_train_text[100]  # looking at an example text
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from scipy.spatial.distance import cdist

from keras.models import Sequential
from keras.layers import Dense, GRU, Embedding
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import imdb

imdb.maybe_download_and_extract()
x_train_text, y_train = imdb.load_data(train=True)
x_test_text, y_test = imdb.load_data(train=False)

data_text = x_train_text + x_test_text

print("Train-set size: ", len(x_train_text))
print("Test-set size:  ", len(x_test_text))

print("Train sample ", x_test_text[1])
print("Review result ", y_train[1])

num_words = 10000
tokenizer = Tokenizer(num_words=num_words)

tokenizer.fit_on_texts(data_text)

if num_words is None:
filter_length = 3
nb_filter = 100


# LSTM
lstm_output_size = 64

# vanilla layer
hidden_dims = 64

# Training
batch_size = 32
nb_epoch = 30

print("Loading data...")
train, valid, test = imdb.load_data(nb_words=max_features,
                                    valid_portion=0.0)
print(len(train[0]), 'train sequences')
print(len(valid[0]), 'valid sequences')
print(len(test[0]), 'test sequences')

X_train, y_train = train
X_valid, y_valid = valid
X_test, y_test = test

print("Pad sequences (samples x time)")
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_valid = sequence.pad_sequences(X_valid, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('X_train shape:', X_train.shape)
print('X_valid shape:', X_valid.shape)
print('X_test shape:', X_test.shape)
Example #12
def pascal_voc(mc, train_graph, eval_graph):
    with tf.name_scope("PASCAL_VOC_input") as scope:
        keys_to_features = imdb.get_keys_to_features()
        # set initial record paths to read data from
        dataset_train_path = os.path.join(mc.DATA_PATH,
                                          "pascal_voc_train.record")
        dataset_eval_path = os.path.join(mc.DATA_PATH, "pascal_voc_val.record")

        mc.ANCHOR_BOX, mc.ANCHORS = imdb.get_anchor_box_from_dataset(
            mc, dataset_train_path, keys_to_features)

        # create a new dataset with preprocessed/filtered records
        if (mc.REDUCE_DATASET):
            if (not mc.ALREADY_PREPROCESSED):
                imdb.reduce_dataset_by_class(mc,
                                             keys_to_features,
                                             dataset_set="train")
                if (eval_graph):
                    eval_mc = edict(mc.copy())
                    # eval_mc.BATCH_SIZE = 1
                    eval_mc.IS_TRAINING = False
                    eval_mc.DATA_AUGMENTATION = False
                    mc.EVAL_ITERS = imdb.reduce_dataset_by_class(
                        eval_mc, keys_to_features, dataset_set="val")
                    eval_mc.EVAL_ITERS = mc.EVAL_ITERS
                    print("EVAL ITERS :%d" % (mc.EVAL_ITERS))
            else:
                pass
                # with open(os.path.join(mc.TRAIN_DIR, "BGR_MEANS.txt"), "r") as f:
                #   mc.BGR_MEANS = np.fromstring(f.readline().split("[[[")[1].split("]]]")[0], sep =" ")
            if (mc.PREPROCESSED_DATA_DIR):
                dataset_train_path = os.path.join(
                    mc.PREPROCESSED_DATA_DIR,
                    "preprocessed_pascal_voc_train.record")
                dataset_eval_path = os.path.join(
                    mc.PREPROCESSED_DATA_DIR,
                    "preprocessed_pascal_voc_val.record")
            else:
                dataset_train_path = os.path.join(
                    mc.TRAIN_DIR, "preprocessed_pascal_voc_train.record")
                dataset_eval_path = os.path.join(
                    mc.TRAIN_DIR, "preprocessed_pascal_voc_val.record")
        elif eval_graph:
            mc.EVAL_ITERS = imdb.get_num_images(mc,
                                                keys_to_features,
                                                dataset_set="val")
        # prepare training dataset
        if train_graph:
            with train_graph.as_default():
                dataset_train = tf.contrib.data.make_batched_features_dataset(
                    dataset_train_path,
                    mc.BATCH_SIZE,
                    keys_to_features,
                    num_epochs=None,
                    reader_num_threads=mc.NUM_THREADS / 2,
                    parser_num_threads=mc.NUM_THREADS / 2,
                    shuffle_buffer_size=12000,
                    shuffle=True,
                    sloppy_ordering=True)
                it_train = dataset_train.make_one_shot_iterator()
                train_list = imdb.load_data(it_train.get_next(),
                                            mc,
                                            training=True,
                                            image_decoder=tf.image.decode_jpeg)
        else:
            train_list = None
        # prepare evaluation dataset
        if (eval_graph):
            with eval_graph.as_default():
                dataset_eval = tf.contrib.data.make_batched_features_dataset(
                    dataset_eval_path,
                    mc.BATCH_SIZE,
                    keys_to_features,
                    num_epochs=None,
                    reader_num_threads=mc.NUM_THREADS / 2,
                    parser_num_threads=mc.NUM_THREADS / 2,
                    shuffle=False)
                it_eval = dataset_eval.make_one_shot_iterator()
                eval_mc = edict(mc.copy())
                eval_mc.IS_TRAINING = False
                eval_mc.DATA_AUGMENTATION = False
                eval_list = imdb.load_data(it_eval.get_next(),
                                           eval_mc,
                                           training=False,
                                           image_decoder=tf.image.decode_jpeg)
        else:
            eval_list = None

        return train_list, eval_list, mc
Example #13
File: lstm.py Project: MasazI/DeepLearning
from collections import OrderedDict
import cPickle as pkl
import sys
import time

import numpy as np
import theano
import theano.tensor as T
from theano import config
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

import imdb

# Datasets
datasets = {"imdb": (imdb.load_data(path="data/imdb/imdb.pkl"), imdb.prepare_data)}

# random number generators seeds for consistency
SEED = 123
np.random.seed(SEED)

# Obtain an array of type floatX
def numpy_floatX(data):
    """ Convert an array to theano's floatX type """
    return np.asarray(data, dtype=config.floatX)


# Get the dataset
def get_dataset(name):
    return datasets[name][0], datasets[name][1]
Example #14
File: lstm.py Project: MasazI/DeepLearning
from collections import OrderedDict
import cPickle as pkl
import sys
import time

import numpy as np
import theano
import theano.tensor as T
from theano import config
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

import imdb

# Datasets
datasets = {'imdb': (imdb.load_data(path='data/imdb/imdb.pkl'), imdb.prepare_data)}

# random number generators seeds for consistency
SEED = 123
np.random.seed(SEED)

# Obtain an array of type floatX
def numpy_floatX(data):
    ''' Convert an array to theano's floatX type '''
    return np.asarray(data, dtype=config.floatX)

# Get the dataset
def get_dataset(name):
    return datasets[name][0], datasets[name][1]

# zip of the model
Example #15
import numpy as np

from keras.datasets import imdb
from keras.layers import Dense
from keras import optimizers
from keras import losses
from keras import metrics


def vectorize_sequences(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.

    return results


(train_data, train_labels), (test_data,
                             test_labels) = imdb.load_data(num_words=10000)

# Get word_index and decode reviews
word_index = imdb.get_word_index()
reverse_word_index = dict([(value, key)
                           for (key, value) in word_index.items()])
decoded_review = ' '.join(
    [reverse_word_index.get(i - 3, '?') for i in train_data[0]])

# Preparing data into binary matrix
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)

y_train = np.asarray(train_labels).astype('float32')
y_test = np.asarray(test_labels).astype('float32')
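
As a quick illustration (not part of the original snippet), vectorize_sequences turns each review into a multi-hot row of length dimension, with a 1 at every word index that occurs in the review:

# Toy check of the multi-hot encoding above, with made-up data.
sample = [[3, 5, 5, 9]]
encoded = vectorize_sequences(sample, dimension=10)
print(encoded)        # [[0. 0. 0. 1. 0. 1. 0. 0. 0. 1.]]
print(encoded.shape)  # (1, 10)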
Example #16
import tensorflow as tf
import numpy as np
from scipy.spatial.distance import cdist

# from tf.keras.models import Sequential  # This does not work!
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

import imdb

imdb.maybe_download_and_extract()

input_text_train, target_train = imdb.load_data(train=True)
input_text_test, target_test = imdb.load_data(train=False)

print("Size of the trainig set: ", len(input_text_train))
print("Size of the testing set:  ", len(input_text_test))

text_data = input_text_train + input_text_test

print('Sample example from the training set...')
print(input_text_train[1])

print('Sample example actual sentiment...')
print(target_train[1])

#Include only the popular ones
num_top_words = 10000
Example #17
def get_imdb(n_words=100000, valid_portion=0.1, maxlen=None):
    """ Get the Large Movie Review (IMDB) dataset splits """
    return imdb.load_data(path="./imdb.pkl", n_words=n_words,
                          valid_portion=valid_portion, maxlen=maxlen)
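
A possible way to call get_imdb (hypothetical usage, assuming the Theano-tutorial imdb.pkl layout where each split is an (x, y) pair):

train, valid, test = get_imdb(n_words=10000, valid_portion=0.1, maxlen=100)
x_train, y_train = train
print(len(x_train), 'training reviews,', len(valid[0]), 'validation reviews')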
Example #18
def coco(mc, train_graph, eval_graph):
    with tf.name_scope("COCO_input"):
        keys_to_features = imdb.get_keys_to_features()

        # return tf.parse_example(batch_records, keys_to_features)
        dataset_train_path = os.path.join(mc.DATA_PATH, "coco_train.record")
        dataset_eval_path = os.path.join(mc.DATA_PATH, "coco_val.record")

        # create a new dataset with preprocessed/filtered records
        if (mc.REDUCE_DATASET and not mc.ALREADY_PREPROCESSED):
            imdb.reduce_dataset_by_class(mc,
                                         keys_to_features,
                                         dataset_set="train")
            if (eval_graph):
                eval_mc = edict(mc.copy())
                # eval_mc.BATCH_SIZE = 1
                eval_mc.IS_TRAINING = False
                eval_mc.DATA_AUGMENTATION = False
                mc.EVAL_ITERS = imdb.reduce_dataset_by_class(eval_mc,
                                                             keys_to_features,
                                                             dataset_set="val")
                eval_mc.EVAL_ITERS = mc.EVAL_ITERS
                dataset_train_path = os.path.join(
                    mc["BASE_DIR"], "preprocessed_" + mc.DATASET_NAME.lower() +
                    "_train.record")
                dataset_eval_path = os.path.join(
                    mc["BASE_DIR"],
                    "preprocessed_" + mc.DATASET_NAME.lower() + "_val.record")
                print("EVAL ITERS :%d" % (mc.EVAL_ITERS))

        if (mc.REDUCE_DATASET and mc.PREPROCESSED_DATA_DIR):
            dataset_train_path = os.path.join(
                mc["PREPROCESSED_DATA_DIR"],
                "preprocessed_" + mc.DATASET_NAME.lower() + "_train.record")
            dataset_eval_path = os.path.join(
                mc["PREPROCESSED_DATA_DIR"],
                "preprocessed_" + mc.DATASET_NAME.lower() + "_val.record")

        # get anchor boxes before creating the input graph
        mc.ANCHOR_BOX, mc.ANCHORS = imdb.get_anchor_box_from_dataset(
            mc, dataset_train_path, keys_to_features)

        # prepare training dataset
        if train_graph:
            with train_graph.as_default():
                dataset_train = tf.contrib.data.make_batched_features_dataset(
                    dataset_train_path,
                    mc.BATCH_SIZE,
                    keys_to_features,
                    num_epochs=mc.TRAIN_EPOCHS,
                    reader_num_threads=8,
                    parser_num_threads=8,
                    shuffle_buffer_size=13000 if mc.IS_TRAINING else 512,
                    sloppy_ordering=True)
                it_train = dataset_train.make_one_shot_iterator()
                train_list = imdb.load_data(it_train.get_next(),
                                            mc,
                                            training=True,
                                            image_decoder=tf.image.decode_jpeg)
        else:
            train_list = None

        # prepare evaluation dataset
        if eval_graph:
            with eval_graph.as_default():
                eval_mc = edict(mc.copy())
                # eval_mc.BATCH_SIZE = 1
                eval_mc.IS_TRAINING = False
                eval_mc.DATA_AUGMENTATION = False
                dataset_eval = tf.contrib.data.make_batched_features_dataset(
                    dataset_eval_path,
                    eval_mc.BATCH_SIZE,
                    keys_to_features,
                    num_epochs=None,
                    reader_num_threads=8,
                    parser_num_threads=8,
                    shuffle=False,
                    drop_final_batch=True)
                it_eval = dataset_eval.make_one_shot_iterator()
                eval_list = imdb.load_data(it_eval.get_next(),
                                           eval_mc,
                                           training=False,
                                           image_decoder=tf.image.decode_png)
        else:
            eval_list = None

    return train_list, eval_list, mc
Example #19
File: main.py Project: giahy2507/lstm-lstm
    return zip(range(len(minibatches)), minibatches)


if __name__ == "__main__":

    vocab_size = 10000
    embsize = 128
    hiddensize = 128
    nstep = 100
    mini_batch_size = 16
    max_epochs = 20
    np.random.seed(4488)


    print('Loading data')
    train, valid, test = imdb.load_data(n_words=10000, valid_portion=0.05, maxlen=100)
    train = test
    ydim = np.max(train[1]) + 1

    print('Build model ')
    X,Mask,Y,\
    cost,err, \
    train_function, valid_function, predict_function = build_model(vocab_size=vocab_size,
                                                                    embsize=embsize,
                                                                    hiddensize=hiddensize)

    print('Training ')
    for eidx in range(max_epochs):
        kf = get_minibatches_idx(len(train[0]), mini_batch_size, shuffle=True)
        costs = []
        errs = []
Example #20
def load_my_data():
    train_data, valid_data, test_data = load_data(n_words=n_words)
    train_x, train_y = train_data
    print train_x[0], train_y[0]
    return train_data, valid_data, test_data
Example #21
from imdb import load_data, prepare_data
import numpy as np
import pickle as pkl

train, valid, test = load_data(n_words=10, valid_portion=0.05)
x = [train[0][t] for t in range(0, len(train[0]))]
y = [train[1][t] for t in range(0, len(train[1]))]
x, mask, y = prepare_data(x, y)
y = np.array(y)
feat_train = np.zeros((x.shape[0], x.shape[1], 10))
for i in range(0, x.shape[0]):
    print "num: " + str(i)
    for j in range(0, x.shape[1]):
        feat_train[i][j][x[i][j]] = 1
np.save("data/feats_train.npy", feat_train)
np.save("data/labels_train.npy", y)
np.save("data/mask_train.npy", mask)
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Embedding, Dense, LSTM, Dropout
import imdb

max_features = 20000
max_length = 80
embedding_dim = 256
batch_size = 128
epochs = 10
modes = [0, 1, 2]

print('Loading data...')
# The data file has to be supplied here; it is worth understanding the parameters of load_data
get_file_path = r"D:\用户目录\Desktop\郭磊\keras\imdb.npz"
(X_train, y_train), (X_test, y_test) = imdb.load_data(get_file_path,
                                                      num_words=max_features)

X_train = sequence.pad_sequences(X_train, max_length)
X_test = sequence.pad_sequences(X_test, max_length)
# It is not explained how the labels are processed
# Compile and train different models while measuring performance.
results = []
for mode in modes:
    print('Testing mode: implementation={}'.format(mode))

    model = Sequential()
    model.add(Embedding(max_features, embedding_dim, input_length=max_length))
    model.add(Dropout(0.2))
    model.add(
        LSTM(embedding_dim,
             dropout=0.2,
Example #23
File: main.py Project: giahy2507/lstm-lstm
    return zip(range(len(minibatches)), minibatches)


if __name__ == "__main__":

    vocab_size = 10000
    embsize = 128
    hiddensize = 128
    nstep = 100
    mini_batch_size = 16
    max_epochs = 20
    np.random.seed(4488)

    print('Loading data')
    train, valid, test = imdb.load_data(n_words=10000,
                                        valid_portion=0.05,
                                        maxlen=100)
    train = test
    ydim = np.max(train[1]) + 1

    print('Build model ')
    X,Mask,Y,\
    cost,err, \
    train_function, valid_function, predict_function = build_model(vocab_size=vocab_size,
                                                                    embsize=embsize,
                                                                    hiddensize=hiddensize)

    print('Training ')
    for eidx in range(max_epochs):
        kf = get_minibatches_idx(len(train[0]), mini_batch_size, shuffle=True)
        costs = []
Example #24
#!/usr/bin/env python
# coding: utf-8

# In[1]:


from keras.datasets import imdb

(train_data, train_labels),(test_data, test_labels) = imdb.load_data(num_words=10000)

print(max([max(sequence) for sequence in train_data]))


# In[2]:


from keras.datasets import imdb

(train_data, train_labels),(test_data, test_labels) = imdb.load_data(num_words=10000)

print(max([max(sequence) for sequence in train_data]))


# In[3]:


word_index = imdb.get_word_index()
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
decoded_review = ' '.join([reverse_word_index.get(i - 3, '?') for i in train_data[0]])
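
The i - 3 shift in the decoding step compensates for the reserved indices in keras.datasets.imdb with its default arguments: 0 is used for padding, 1 marks the start of a sequence, 2 stands for out-of-vocabulary words, and real word indices are offset by index_from=3. A short check (not part of the original snippet):

# Why i - 3: word_index holds raw ranks, while the encoded reviews are shifted by 3.
print(word_index['the'])    # rank of 'the' in the raw index
print(train_data[0][:5])    # encoded review; begins with the start-of-sequence marker 1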

Example #25
It is being called by main.py.
"""
import os
#import dash
import dash_html_components as html
import dash_core_components as dcc
#from dash.dependencies import Input, Output
import imdb

DATA_DIR = 'data'
PROCESSED_DIR = 'processed'
IMDB_FILE = 'imdb_df.csv'
GENRES_FILE = 'set_genres.pkl'
IMDB_PATH = os.path.join(DATA_DIR, PROCESSED_DIR, IMDB_FILE)
GENRES_PATH = os.path.join(DATA_DIR, PROCESSED_DIR, GENRES_FILE)
df_imdb = imdb.load_data(IMDB_PATH)
genres = imdb.load_genres(GENRES_PATH)

external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

tab2_layout = html.Div([
    html.Div([
        html.Label('Filter by:'),
        dcc.Checklist(id='filter-checklist',
                      options=[{
                          'label': 'Genre',
                          'value': 'Genre'
                      }, {
                          'label': 'Year',
                          'value': 'Year'
                      }],
# Convolution
filter_length = 3
nb_filter = 100

# LSTM
lstm_output_size = 64

# vanilla layer
hidden_dims = 64

# Training
batch_size = 32
nb_epoch = 30

print("Loading data...")
train, valid, test = imdb.load_data(nb_words=max_features, valid_portion=0.0)
print(len(train[0]), 'train sequences')
print(len(valid[0]), 'valid sequences')
print(len(test[0]), 'test sequences')

X_train, y_train = train
X_valid, y_valid = valid
X_test, y_test = test

print("Pad sequences (samples x time)")
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_valid = sequence.pad_sequences(X_valid, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('X_train shape:', X_train.shape)
print('X_valid shape:', X_valid.shape)
print('X_test shape:', X_test.shape)
Example #27
import preprocess as pre
import imdb
import numpy as np
import matplotlib.pyplot as plt
import gc
import InitializeModel as im
import pickle

print('reading vocabulary list')
word_index = imdb.get_word_index()
# word_index = imdb.get_filtered_word_index()
print('reading IMDB_data')
(train_data, train_labels), (test_data,
                             test_labels) = imdb.load_data(path='imdb',
                                                           num_words=10000)
data = np.concatenate((train_data, test_data), axis=0)
targets = np.concatenate((train_labels, test_labels), axis=0)
# Optional: analyze the dataset
print("Categories:", np.unique(targets))
print("Number of unique words:", len(np.unique(np.hstack(data))))

# Reverse word_index so that integer indices map back to words

reverse_word_index = dict([(value, key)
                           for (key, value) in word_index.items()])

decoded_review = ' '.join(
    [reverse_word_index.get(i - 3, '?') for i in train_data[0]])
print('decoded sample: ')
Example #28
def train_lstm(
    dim_proj=128,  # word embedding dimension and LSTM number of hidden units.
    patience=10,  # Number of epochs to wait before early stopping if no progress
    max_epochs=50,  # The maximum number of epochs to run
    dispFreq=10,  # Display to stdout the training progress every N updates
    decay_c=0.,  # Weight decay for the classifier applied to the U weights.
    lrate=0.0001,  # Learning rate for sgd (not used for adadelta and rmsprop)
    n_words=10000,  # Vocabulary size
    optimizer=adadelta,  # sgd, adadelta and rmsprop available; sgd is very hard to use and not recommended (it probably needs momentum and a decaying learning rate).
    encoder='lstm',  # TODO: can be removed, must be lstm.
    saveto='save/lstm_model.npz',  # The best model will be saved there
    validFreq=370,  # Compute the validation error after this number of updates.
    saveFreq=50,  # Save the parameters after every saveFreq updates
    maxlen=100,  # Sequences longer than this get ignored
    batch_size=16,  # The batch size during training.
    valid_batch_size=64,  # The batch size used for validation/test set.
    dataset='imdb',

    # Parameters for extra options
    noise_std=0.,
    use_dropout=True,  # if False slightly faster, but worse test error
                       # This frequently needs a bigger model.
    reload_model=None,  # Path to a saved model we want to start from.
    test_size=-1,  # If >0, we keep only this number of test examples.
):

    # Model options
    model_options = locals().copy()
    print("model options", model_options)

    #load_data, prepare_data = get_dataset(dataset)

    print('Loading data')
    train, valid, test = imdb.load_data(n_words=n_words, valid_portion=0.05,
                                   maxlen=maxlen)
    if test_size > 0:
        # The test set is sorted by size, but we want to keep
        # randomly sized examples, so we select a random subset of them.
        idx = np.arange(len(test[0]))
        np.random.shuffle(idx)
        idx = idx[:test_size]
        test = ([test[0][n] for n in idx], [test[1][n] for n in idx])

    ydim = np.max(train[1]) + 1

    model_options['ydim'] = ydim

    print('Building model')
    # This creates the initial parameters as numpy ndarrays.
    # Dict name (string) -> numpy ndarray
    params = init_params(model_options)

    if reload_model:
        load_params('lstm_model.npz', params)

    # This creates Theano shared variables from the parameters.
    # Dict name (string) -> Theano Tensor Shared Variable
    # params and tparams hold different copies of the weights.
    tparams = init_tparams(params)

    # use_noise is for dropout
    (use_noise, x, mask,
     y, f_pred_prob, f_pred, cost) = build_model(tparams, model_options)

    if decay_c > 0.:
        decay_c = T.shared(numpy_floatX(decay_c), name='decay_c')
        weight_decay = 0.
        weight_decay += (tparams['U'] ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    f_cost = T.function([x, mask, y], cost, name='f_cost')

    grads = tensor.grad(cost, wrt=list(tparams.values()))
    f_grad = T.function([x, mask, y], grads, name='f_grad')

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = optimizer(lr, tparams, grads,
                                        x, mask, y, cost)

    print('Optimization')

    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)

    print("%d train examples" % len(train[0]))
    print("%d valid examples" % len(valid[0]))
    print("%d test examples" % len(test[0]))

    history_errs = []
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0]) // batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) // batch_size

    uidx = 0  # the number of updates done
    estop = False  # early stop
    start_time = time.time()
    try:
        for eidx in range(max_epochs):
            n_samples = 0

            # Get new shuffled index for the training set.
            kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)

            for _, train_index in kf:
                uidx += 1
                use_noise.set_value(1.)

                # Select the random examples for this minibatch
                y = [train[1][t] for t in train_index]
                x = [train[0][t]for t in train_index]

                # Get the data in numpy.ndarray format.
                # This swaps the axes!
                # Returns something of shape (minibatch maxlen, n samples)
                x, mask, y = imdb.prepare_data(x, y)
                n_samples += x.shape[1]

                cost = f_grad_shared(x, mask, y)
                f_update(lrate)

                if np.isnan(cost) or np.isinf(cost):
                    print('bad cost detected: ', cost)
                    return 1., 1., 1.

                if np.mod(uidx, dispFreq) == 0:
                    print('Epoch ', eidx, 'Update ', uidx, 'Cost ', cost)

                if saveto and np.mod(uidx, saveFreq) == 0:
                    print('Saving...')

                    if best_p is not None:
                        params = best_p
                    else:
                        params = unzip(tparams)
                    np.savez(saveto, history_errs=history_errs, **params)
                    pickle.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1)
                    print('Done')

                if np.mod(uidx, validFreq) == 0:
                    use_noise.set_value(0.)
                    train_err = pred_error(f_pred, imdb.prepare_data, train, kf)
                    valid_err = pred_error(f_pred, imdb.prepare_data, valid,
                                           kf_valid)
                    test_err = pred_error(f_pred, imdb.prepare_data, test, kf_test)

                    history_errs.append([valid_err, test_err])

                    if (best_p is None or
                            valid_err <= np.array(history_errs)[:, 0].min()):

                        best_p = unzip(tparams)
                        bad_counter = 0

                    print('Train ', train_err, 'Valid ', valid_err,
                           'Test ', test_err)

                    if (len(history_errs) > patience and
                            valid_err >= np.array(history_errs)[:-patience, 0].min()):
                        bad_counter += 1
                        if bad_counter > patience:
                            print('Early Stop!')
                            estop = True
                            break

            print('Seen %d samples' % n_samples)

            if estop:
                break

    except KeyboardInterrupt:
        print("Training interupted")

    end_time = time.time()
    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)

    use_noise.set_value(0.)
    kf_train_sorted = get_minibatches_idx(len(train[0]), batch_size)
    train_err = pred_error(f_pred, imdb.prepare_data, train, kf_train_sorted)
    valid_err = pred_error(f_pred, imdb.prepare_data, valid, kf_valid)
    test_err = pred_error(f_pred, imdb.prepare_data, test, kf_test)

    print( 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err )
    if saveto:
        np.savez(saveto, train_err=train_err,
                    valid_err=valid_err, test_err=test_err,
                    history_errs=history_errs, **best_p)
    print('The code ran for %d epochs, with %f sec/epoch' % (
        (eidx + 1), (end_time - start_time) / (1. * (eidx + 1))))
    print( ('Training took %.1fs' %
            (end_time - start_time)), file=sys.stderr)
    return train_err, valid_err, test_err
Example #29
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

import imdb
import helper_functions

# Data Import
#------------------------------------------------------------------------------

imdb.maybe_download_and_extract()

x_train_text, y_train = imdb.load_data(
    train=True)  # I added utf-8 encoding in the code in imdb
x_test_text, y_test = imdb.load_data(train=False)

data_text = x_train_text + x_test_text

# Tokenizer
#------------------------------------------------------------------------------

num_words = 30000
tokenizer = Tokenizer(num_words=num_words)

tokenizer.fit_on_texts(data_text)

x_train_tokens = tokenizer.texts_to_sequences(x_train_text)
x_test_tokens = tokenizer.texts_to_sequences(x_test_text)
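
The snippet stops after tokenization; a possible continuation (a sketch, reusing the pad_sequences import above and picking a cutoff from the token-length statistics) is:

import numpy as np

# Hypothetical next step: pad/truncate every review to a common length.
num_tokens = np.array([len(tokens) for tokens in x_train_tokens + x_test_tokens])
max_tokens = int(np.mean(num_tokens) + 2 * np.std(num_tokens))  # covers most reviews
x_train_pad = pad_sequences(x_train_tokens, maxlen=max_tokens, padding='pre', truncating='pre')
x_test_pad = pad_sequences(x_test_tokens, maxlen=max_tokens, padding='pre', truncating='pre')
print('x_train_pad shape:', x_train_pad.shape)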