Example #1
def add_doc_to_vocab(filename, vocab):
    # load doc
    doc = load_doc(filename)
    # clean doc
    tokens = clean_doc(doc)
    # update counts
    vocab.update(tokens)
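# The helper above only updates a shared counter; a minimal usage sketch,
# assuming load_doc/clean_doc are the usual loading/tokenizing helpers and
# that the documents live under a hypothetical 'txt_sent' directory.
from os import listdir
from collections import Counter

vocab = Counter()
for name in listdir('txt_sent'):
    add_doc_to_vocab('txt_sent/' + name, vocab)

# keep only tokens that occur at least twice (threshold is illustrative)
min_occurrence = 2
tokens = [w for w, c in vocab.items() if c >= min_occurrence]
print(len(vocab), len(tokens))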
Example #2
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
from pickle import dump
from utils import load_doc
import numpy as np

def define_model(vocab_size, seq_length):
  model = Sequential()
  model.add(Embedding(vocab_size, 50, input_length=seq_length))
  model.add(LSTM(100, return_sequences=True))
  model.add(LSTM(100))
  model.add(Dense(100, activation='relu'))
  model.add(Dense(vocab_size, activation='softmax'))
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  model.summary()
  return model

doc = load_doc('republic_sequences.txt')
lines = doc.split('\n')

tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)

vocab_size = len(tokenizer.word_index) + 1
print('Vocab size is: {:d}'.format(vocab_size))

sequences = np.array(sequences)
X = sequences[:,:-1]
Y = sequences[:,-1]
Y = to_categorical(Y, num_classes=vocab_size)
seq_length = X.shape[1]
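
# The model is defined and X/Y prepared, but the snippet stops before training;
# a minimal continuation sketch (epoch and batch values are illustrative).
model = define_model(vocab_size, seq_length)
model.fit(X, Y, batch_size=128, epochs=100)

# save the fitted model and the tokenizer for later text generation
model.save('model.h5')
dump(tokenizer, open('tokenizer.pkl', 'wb'))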
Example #3
from utils import load_doc, save_sequences
import numpy as np

raw_text = load_doc('data/rhyme.txt')

# remove newlines and re-join characters into one long sequence
tokens = raw_text.split()
raw_text = ' '.join(tokens)

# Create sequences. Each input sequence 11 chars long; 10 chars as input, 1 for output
length = 10
sequences = list()

for i in range(length, len(raw_text)):
    seq = raw_text[i - length:i + 1]
    sequences.append(seq)

print("Total sequences: {:d}".format(len(sequences)))

save_sequences(sequences, 'char_sequences.txt')
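
# save_sequences is imported from the project's utils module and not shown here;
# a plausible sketch of such a helper, writing one sequence per line.
def save_sequences(lines, filename):
    # join the sequences with newlines and write them to disk
    data = '\n'.join(lines)
    with open(filename, 'w') as f:
        f.write(data)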
Example #4
from keras.models import Sequential, load_model
from keras.layers import Dense, Flatten, Embedding
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# project helpers, assumed to live in utils as in the other examples
from utils import load_doc, load_clean_dataset

def base_model(vocab_size, max_length):
  model = Sequential()
  model.add(Embedding(vocab_size, 100, input_length=max_length))
  model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
  model.add(MaxPooling1D(pool_size=2))
  model.add(Flatten())
  model.add(Dense(10, activation='relu'))
  model.add(Dense(1, activation='sigmoid'))
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = set(vocab.split())

trainX, ytrain = load_clean_dataset(vocab, True)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(trainX)
vocab_size = len(tokenizer.word_index) + 1
print("[INFO] Vocab size: {:d}".format(vocab_size))

max_length = max([len(s.split()) for s in trainX])
print("[INFO] Max length: {:d}".format(max_length))

encoded = tokenizer.texts_to_sequences(trainX)
Xtrain = pad_sequences(encoded, maxlen=max_length, padding='post')
print("[INFO] Xtrain shape: {}, ytrain shape: {}".format(Xtrain.shape, ytrain.shape))
Example #5
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.preprocessing.image import load_img, img_to_array
from keras.models import Model, load_model
from pickle import load
# load_doc and generate_desc are project helpers; TOKEN, MAX_LENGTH,
# MODEL_FILE, and TEST_NEW_IMAGE are path constants defined elsewhere

# extract features from a single photo
def extract_features(filename):
    # load the model
    model = VGG16()
    # re-structure the model
    model.layers.pop()
    model = Model(inputs=model.inputs, outputs=model.layers[-1].output)
    # load the photo
    image = load_img(filename, target_size=(224, 224))
    # convert the image pixels to a numpy array
    image = img_to_array(image)
    # reshape data for the model
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    # prepare the image for the VGG model
    image = preprocess_input(image)
    # get features
    feature = model.predict(image, verbose=0)
    return feature


# load the tokenizer
tokenizer = load(open(TOKEN, 'rb'))
# pre-define the max sequence length (from training)
max_length = int(load_doc(MAX_LENGTH))
# load the model
model = load_model(MODEL_FILE)
# load and prepare the photograph
photo = extract_features(TEST_NEW_IMAGE)
# generate description
description = generate_desc(model, tokenizer, photo, max_length)
print(description)
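
# generate_desc is a project helper not shown in this snippet; a sketch of a
# typical greedy decoder, assuming the captions were trained with
# startseq/endseq boundary tokens and that pad_sequences/numpy are available.
from keras.preprocessing.sequence import pad_sequences
import numpy as np

def word_for_id(integer, tokenizer):
    # map a predicted integer back to its word
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

def generate_desc(model, tokenizer, photo, max_length):
    # seed the generation process
    in_text = 'startseq'
    for _ in range(max_length):
        # integer-encode and pad the sequence generated so far
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        # predict and append the most likely next word
        yhat = model.predict([photo, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = word_for_id(yhat, tokenizer)
        if word is None:
            break
        in_text += ' ' + word
        if word == 'endseq':
            break
    return in_text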
Example #6
from keras.callbacks import LambdaCallback
from utils import load_doc
# sample() and print_sample() are project helpers defined elsewhere


def on_epoch_end(epoch, _):
    print()
    if epoch % 10 == 0:
        print('----- Generating text after Epoch: {}'.format(epoch))
        for i in range(7):
            sampled_indices = sample(model,
                                     char_to_ix,
                                     seq_length=27,
                                     n_chars=50)
            print_sample(sampled_indices, ix_to_char)


data = load_doc("./data/dinos.txt")
data = data.lower()
chars = sorted(list(set(data)))

char_to_ix = {c: i for i, c in enumerate(chars)}
print(char_to_ix)
ix_to_char = {i: c for i, c in enumerate(chars)}
vocab_size = len(chars)
print('[INFO] Vocab size: {:d}'.format(vocab_size))

# Prepare inputs (we're sweeping from left to right in steps seq_length long)
seq_length = 27
X = list()
Y = list()
sequencesIn = list()
sequencesOut = list()
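
# The original loop is cut off here; a rough sketch of the sweep it describes,
# assuming non-overlapping windows of seq_length characters with the
# next-character sequence as the target (the exact scheme is an assumption).
for i in range(0, len(data) - seq_length, seq_length):
    seq_in = data[i:i + seq_length]
    seq_out = data[i + 1:i + seq_length + 1]
    sequencesIn.append(seq_in)
    sequencesOut.append(seq_out)
    X.append([char_to_ix[ch] for ch in seq_in])
    Y.append([char_to_ix[ch] for ch in seq_out])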
Example #7
from os import listdir
from pickle import dump
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.preprocessing.image import load_img, img_to_array
from keras.models import Model
from utils import load_doc

# extract features from each photo in the directory
def extract_features(directory):
    # load the VGG16 model and drop the final classification layer
    model = VGG16()
    model.layers.pop()
    model = Model(inputs=model.inputs, outputs=model.layers[-1].output)
    # collect features keyed by image identifier
    features = dict()
    for name in listdir(directory):
        filename = directory + '/' + name
        image = load_img(filename, target_size=(224, 224))
        image = img_to_array(image)
        image = image.reshape(
            (1, image.shape[0], image.shape[1], image.shape[2]))
        image = preprocess_input(image)
        feature = model.predict(image, verbose=0)
        image_id = name.split('.')[0]
        features[image_id] = feature
        print('>%s' % name)
    return features


# extract features from all images
directory = 'dataset/Flicker8k_Dataset'
features = extract_features(directory)
print('Extracted Features: %d' % len(features))
# save to file
dump(features, open('features.pkl', 'wb'))

filename = 'dataset/Flickr8k.token.txt'
doc = load_doc(filename)
# parse descriptions
descriptions = load_descriptions(doc)
print('Loaded: %d ' % len(descriptions))
clean_descriptions(descriptions)
vocabulary = to_vocabulary(descriptions)
print('Vocabulary Size: %d' % len(vocabulary))
# save to file
save_descriptions(descriptions, 'descriptions.txt')
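
# load_descriptions (like clean_descriptions, to_vocabulary, and
# save_descriptions) is a project helper defined elsewhere; a sketch of how the
# Flickr8k token file is typically parsed, where each line has the form
# "image_name.jpg#index <caption words>".
def load_descriptions(doc):
    # map each image identifier to its list of captions
    mapping = dict()
    for line in doc.split('\n'):
        tokens = line.split()
        if len(tokens) < 2:
            continue
        image_id, image_desc = tokens[0], tokens[1:]
        # drop the file extension and caption index from the identifier
        image_id = image_id.split('.')[0]
        mapping.setdefault(image_id, []).append(' '.join(image_desc))
    return mapping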
Example #8
from utils import load_doc
from pickle import dump
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import LSTM, Dense
import numpy as np

def define_model(X, vocab_size):
  model = Sequential()
  model.add(LSTM(75, input_shape=(X.shape[1], X.shape[2])))
  model.add(Dense(vocab_size, activation='softmax'))
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
  model.summary()
  return model

raw_text = load_doc('char_sequences.txt')
lines = raw_text.split('\n')

chars = sorted(list(set(raw_text)))
mapping = dict((c, i) for i, c in enumerate(chars))
vocab_size = len(mapping)
print("Vocab size: {:d}".format(vocab_size))

sequences = list()
for line in lines:
  encoded_seq = [mapping[char] for char in line]
  sequences.append(encoded_seq)

sequences = np.array(sequences)
X, y = sequences[:,:-1], sequences[:, -1]
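
# The snippet ends with the input/output split; a minimal sketch of the
# remaining steps, assuming one-hot encoding over the character vocabulary
# before fitting (filenames and epoch count are illustrative).
X = np.array([to_categorical(x, num_classes=vocab_size) for x in X])
y = to_categorical(y, num_classes=vocab_size)

model = define_model(X, vocab_size)
model.fit(X, y, epochs=100, verbose=2)

# save the model and the character mapping for later generation
model.save('model.h5')
dump(mapping, open('mapping.pkl', 'wb'))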