Example #1
import pickle

from keras.preprocessing import sequence


def vectorizer(text):
    # Load the tokenizer that was fitted during training
    # (MAX_SEQUENCE_LENGTH is defined elsewhere in the original script)
    with open('./tokenizer.pkl', 'rb') as handle:
        tokenizer = pickle.load(handle)
    text = [text]
    text = tokenizer.texts_to_sequences(text)
    text = sequence.pad_sequences(text, maxlen=MAX_SEQUENCE_LENGTH)

    return text
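A quick usage sketch for the function above (hypothetical: it assumes tokenizer.pkl exists on disk, MAX_SEQUENCE_LENGTH is defined, and a trained Keras model is available from the original project):

padded = vectorizer('this movie was surprisingly good')
print(padded.shape)  # (1, MAX_SEQUENCE_LENGTH)
# prediction = model.predict(padded)  # `model` is assumed to exist elsewhere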
Example #2
import pickle

from keras.preprocessing import sequence


def convert_text(string_text, to_rgb=False):
    # Accept either a single string or a list of strings
    if not isinstance(string_text, list):
        string_text = [string_text]
    # Load the tokenizer that was fitted during training
    with open('save_tokenizer/tokenizer3.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)
    string_seq = tokenizer.texts_to_sequences(string_text)
    string_pad = sequence.pad_sequences(string_seq, maxlen=686)
    text_converted = string_pad  # string_seq

    print('Text_converted: ', text_converted)
    return text_converted
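For context, a tokenizer pickle like the one loaded above is usually produced once at training time. A minimal sketch, assuming the same file path; the corpus and vocabulary size here are placeholders, not values from the original project:

import pickle

from keras.preprocessing.text import Tokenizer

corpus = ['example training sentence one', 'another training sentence']  # placeholder data
tokenizer = Tokenizer(num_words=10000)  # vocabulary size is an assumption
tokenizer.fit_on_texts(corpus)
with open('save_tokenizer/tokenizer3.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)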
Example #3
import os

# Suppress warning and informational messages from TensorFlow
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

from keras.datasets import imdb
from keras.layers import Dense, Embedding, LSTM
from keras.models import Sequential
from keras.preprocessing import sequence

NUM_WORDS = 6000  # only the top n most frequent words are kept
SKIP_TOP = 0  # skip this many of the most frequent words (e.g. "the", "and", "a")
MAX_REVIEW_LEN = 400  # maximum number of words kept from each review

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=NUM_WORDS,
                                                      skip_top=SKIP_TOP)

# print a sample
# print("econded word sequence:", x_train[3])

x_train = sequence.pad_sequences(x_train, maxlen=MAX_REVIEW_LEN)
x_test = sequence.pad_sequences(x_test, maxlen=MAX_REVIEW_LEN)

print("x_train.shape", x_train.shape, "x_test.shape:", x_test.shape)

model = Sequential()
model.add(Embedding(NUM_WORDS, 64))
model.add(LSTM(128))
model.add(Dense(1, activation="sigmoid"))

model.compile(loss="binary_crossentropy",
              optimizer="adam",
              metrics=["accuracy"])

BATCH_SIZE = 24
EPOCHS = 5
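The snippet defines BATCH_SIZE and EPOCHS but stops before training; a minimal sketch of how the model would typically be fit and evaluated with these settings (the validation split is an assumption, not from the original):

model.fit(x_train, y_train,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          validation_split=0.2)  # assumed hold-out fraction

loss, acc = model.evaluate(x_test, y_test, batch_size=BATCH_SIZE)
print('test loss:', loss, 'test accuracy:', acc)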
Example #4
# `le` (a fitted LabelEncoder) and `data_train`/`data_test` are created earlier in the original script
y_train = le.transform(data_train.category)
y_test = le.transform(data_test.category)
Y_train = np_utils.to_categorical(y_train)
Y_test = np_utils.to_categorical(y_test)

## Tokenize text
logging.info("Tokenizing text...")
tokenizer = Tokenizer(num_words = vocab_size)
tokenizer.fit_on_texts(data_train.text)
x_train = tokenizer.texts_to_sequences(data_train.text)
x_test = tokenizer.texts_to_sequences(data_test.text)

## Pad sequences
logging.info("Transforming tokens into sequences...")
max_input_size = len(max(x_train, key = len))
X_train = sequence.pad_sequences(x_train, maxlen = max_input_size)
X_test = sequence.pad_sequences(x_test, maxlen = max_input_size)
print('x_train shape:', X_train.shape)
print('x_test shape:', X_test.shape)
# One copy of the padded input per convolutional channel (one channel per kernel size)
X_train_multi = []
X_test_multi = []
for i in range(len(kernel_size)):
    X_train_multi.append(X_train)
    X_test_multi.append(X_test)

## Build model
# 1. Embeddings layer
inputs = Input(shape = (max_input_size, ))
x = Embedding(vocab_size, embedding_dims)(inputs)
# 2. Convolutional channels for n-grams
channels = []
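The example stops right after creating the empty channels list. The X_train_multi lists above suggest the original model fed one input per n-gram channel; as a simpler hedged sketch, the channels below all branch off the single embedded input x shown here, and the filter count, dropout rate, and output layer are assumptions rather than values from the original:

from keras.layers import Concatenate, Conv1D, Dense, Dropout, GlobalMaxPooling1D
from keras.models import Model

# One Conv1D "channel" per n-gram size in kernel_size (e.g. [2, 3, 4])
for size in kernel_size:
    conv = Conv1D(filters=128, kernel_size=size, activation='relu')(x)  # filter count assumed
    pooled = GlobalMaxPooling1D()(conv)
    channels.append(pooled)

merged = Concatenate()(channels) if len(channels) > 1 else channels[0]
merged = Dropout(0.5)(merged)  # dropout rate assumed
outputs = Dense(Y_train.shape[1], activation='softmax')(merged)

model = Model(inputs=inputs, outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])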
vectorizer = CountVectorizer()
sentence_enISEAR = vectorizer.fit_transform(sentence_enISEAR)

# Preprocessing dataset:
# convert to lower case and strip periods and commas
data = data.str.lower().str.replace(".", "", regex=False).str.replace(",", "", regex=False)

tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, lower=True)
tokenizer.fit_on_texts(data)
vocab_size = len(tokenizer.word_index) + 1
word_index = tokenizer.word_index
num_words = min(MAX_NUM_WORDS, vocab_size)

data_enISEAR = tokenizer.texts_to_sequences(data)
data_padded = sequence.pad_sequences(data_enISEAR,
                                     maxlen=MAX_SEQUENCE_LENGTH,
                                     padding='post')

embedding_path = '../../../../../Downloads/glove.6B.300d.txt'
# embedding_matrix = util.embedding.prepareEmbeddings(word_index, MAX_NUM_WORDS, embedding_path)
# util.embedding.saveEmbedding(embedding_matrix, EMBEDDING_FILE)
print('Loading embedding ' + EMBEDDING_FILE)
embedding_matrix = util.embedding.loadEmbedding(EMBEDDING_FILE)
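util.embedding is a project-specific helper, so its exact output is not shown here. Assuming it returns a NumPy matrix with one 300-dimensional row per word index, a pre-trained matrix like this is typically wired into a Keras model as a frozen Embedding layer; a sketch, not the original code:

from keras.layers import Embedding

# Frozen embedding layer initialised from the pre-trained matrix
# (300 matches the glove.6B.300d vectors referenced above)
embedding_layer = Embedding(num_words, 300,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)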

class_weight = {
    0: 1.131,
    1: 1.000,
    2: 1.903,
    3: 5.107,
    4: 2.019,
    5: 3.338,
}

# Print the cross-validation configuration
print('   Folds        :', KFOLDS)
print('   Runs         :', ROUNDS)
print('-------------------------------\n')

# Tokenize and create word index
print('INFO: Loading Dataset')
instances = instances.str.lower().str.replace('.', '', regex=False).str.replace(',', '', regex=False)
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, lower=True)
tokenizer.fit_on_texts(instances)
vocab_size = len(tokenizer.word_index) + 1
word_index = tokenizer.word_index
num_words = min(MAX_NUM_WORDS, vocab_size)

instances_sequences = tokenizer.texts_to_sequences(instances)
instances_padded = sequence.pad_sequences(instances_sequences,
                                          maxlen=MAX_SEQUENCE_LENGTH,
                                          padding='post')

if (args.testset):
    instances_test = instances_test.str.lower().str.replace(
        '.', '', regex=False).str.replace(',', '', regex=False)
    instances_sequences_test = tokenizer.texts_to_sequences(instances_test)
    instances_padded_test = sequence.pad_sequences(instances_sequences_test,
                                                   maxlen=MAX_SEQUENCE_LENGTH,
                                                   padding='post')

# Prepare embedding if not present yet
EMBEDDING_DIMS = 300
if (args.createembedding):
    if (not args.createembedding.endswith('.npy')):
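The snippet cuts off here, and util.embedding.prepareEmbeddings is project-specific. For reference, a generic sketch of what building a GloVe embedding matrix from a .txt file usually looks like; the function name and details below are illustrative, not the original helper's API:

import numpy as np


def build_embedding_matrix(glove_path, word_index, max_words, dims=EMBEDDING_DIMS):
    """Map each word in word_index to its GloVe vector; unknown words stay all-zero."""
    vectors = {}
    with open(glove_path, encoding='utf8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            vectors[parts[0]] = np.asarray(parts[1:], dtype='float32')

    matrix = np.zeros((min(max_words, len(word_index) + 1), dims))
    for word, i in word_index.items():
        if i < matrix.shape[0] and word in vectors:
            matrix[i] = vectors[word]
    return matrix

# embedding_matrix = build_embedding_matrix(embedding_path, word_index, MAX_NUM_WORDS)
# np.save(args.createembedding, embedding_matrix)  # cache as .npy for later runs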
Example #7
from keras.preprocessing import sequence  # utilities for padding/truncating sequence data
from keras.datasets import imdb
from keras import layers, models

"""----------数据准备----------"""
max_features = 10000
maxlen = 500
batch_size = 32
print('Loading data...')

#获取到数据
(input_train, y_train), (input_test, y_test) = imdb.load_data( num_words=max_features)
print(len(input_train), 'train sequences')#打印数据个数,一共25000调评论,单词都化成数字了,是一个25000xn的列表,其中n长度不定
print(len(input_test), 'test sequences')
print('Pad sequences (samples x time)')
#数据整形
input_train =sequence.pad_sequences(input_train, maxlen=maxlen)#这里截取的是25000条评论各自的长度,而不是取25000里面500个。?所以说sequence截断的二级维度上的
input_test = sequence.pad_sequences(input_test, maxlen=maxlen)
print('input_train shape:', input_train.shape)
print('input_test shape:', input_test.shape)

"""----------网络准备----------"""
"embedding layer是一个可变的字典映射,将单词(已经化成整数)映射到一个空间里,使得能够保留之间的关系,10000表示这个空间只接受前10000个常用的,这个层好像只适合用于文本类的信息处理"
model=models.Sequential()
model.add(layers.Embedding(max_features,32))#max_fetures是10000个常用的单词,这里的32不知道是什么,可能是数据组数,那后面的batch_size有什么用呢
model.add(layers.SimpleRNN(32))
model.add(layers.Dense(1,activation='sigmoid'))
model.summary()#打印一下模型

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
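The snippet ends at compile time; a minimal sketch of how this model would typically be trained on the padded IMDB data (the epoch count and validation split are assumptions):

history = model.fit(input_train, y_train,
                    epochs=10,             # assumed
                    batch_size=batch_size,
                    validation_split=0.2)  # assumed hold-out fraction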
# Print the cross-validation configuration
print('   Folds        :', KFOLDS)
print('   Runs         :', ROUNDS)
print('---------------------------------------------------------- \n')


# Preprocessing dataset:
# convert to lower case and strip periods and commas
text_instances = text_instances.str.lower().str.replace(".", "", regex=False).str.replace(",", "", regex=False)
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, lower=True)
tokenizer.fit_on_texts(text_instances)
vocab_size = len(tokenizer.word_index) + 1
word_index = tokenizer.word_index
num_words = min(MAX_NUM_WORDS, vocab_size)

text_instances = tokenizer.texts_to_sequences(text_instances)
text_instances_padded = sequence.pad_sequences(
            text_instances, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

if (args.testset):
    text_instances_test = text_instances_test.str.lower().str.replace(
        ".", "", regex=False).str.replace(",", "", regex=False)
    text_instances_test = tokenizer.texts_to_sequences(text_instances_test)
    text_instances_padded_test = sequence.pad_sequences(
        text_instances_test, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

if (args.annotate):
    text_instances_annotate = text_instances_annotate.str.lower().str.replace(
        ".", "", regex=False).str.replace(",", "", regex=False)
    text_instances_annotate = tokenizer.texts_to_sequences(text_instances_annotate)
    text_instances_padded_annotate = sequence.pad_sequences(
        text_instances_annotate, maxlen=MAX_SEQUENCE_LENGTH, padding='post')


# Prepare embedding