Example #1
File: lab6.py  Project: Ksenox/ANN-2021
import matplotlib.pyplot as plt
import numpy as np
from tensorflow.keras import Sequential, regularizers
from tensorflow.keras.datasets import imdb
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

(X_train, y_train), (X_test, y_test) = imdb.load_data()

(training_data,
 training_targets), (testing_data,
                     testing_targets) = imdb.load_data(num_words=500)
data = np.concatenate((training_data, testing_data), axis=0)
targets = np.concatenate((training_targets, testing_targets), axis=0)
index = imdb.get_word_index()
reverse_index = dict([(value, key) for (key, value) in index.items()])
decoded = " ".join([reverse_index.get(i - 3, "#") for i in data[0]])
print(decoded)


def plot_loss(loss, v_loss):
    plt.figure(1, figsize=(8, 5))
    plt.plot(loss, 'b', label='train')
    plt.plot(v_loss, 'r', label='validation')
    plt.title('Loss')
    plt.ylabel('loss')
    plt.xlabel('epochs')
    plt.legend()
    plt.show()
    plt.clf()
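The imports at the top of lab6.py (Sequential, Dense, Dropout, Adam, regularizers) are not used in the excerpt above. A minimal sketch of how they could be combined with plot_loss, assuming the reviews are multi-hot encoded over the num_words=500 vocabulary; the layer sizes, dropout rate, regularization strength, and training settings are illustrative choices, not the project's actual values:

def vectorize(sequences, dimension=500):
    # Multi-hot encode each review: mark every word id that occurs in it.
    out = np.zeros((len(sequences), dimension))
    for i, seq in enumerate(sequences):
        out[i, seq] = 1
    return out

x = vectorize(data)
y = targets.astype("float32")

model = Sequential([
    Dense(16, activation="relu", input_shape=(500,),
          kernel_regularizer=regularizers.l2(1e-4)),
    Dropout(0.5),
    Dense(1, activation="sigmoid"),
])
model.compile(optimizer=Adam(learning_rate=1e-3),
              loss="binary_crossentropy", metrics=["accuracy"])
history = model.fit(x, y, epochs=10, batch_size=512,
                    validation_split=0.2, verbose=0)
plot_loss(history.history["loss"], history.history["val_loss"])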
Example #2
import numpy as np
from tensorflow.keras.datasets import imdb

embeddings_index = dict()
with open('glove.6B.100d.txt', encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# To create the embedding matrix:

vocabulary_size = 10000

word2id = imdb.get_word_index()   # dictionary from words to integers (the id of the word in the vocab)
id2word = {i: word for word, i in word2id.items()}
# Embedding matrix holds the vector representation for the words.
embedding_matrix = np.zeros((vocabulary_size, 100))  # one 100-d GloVe vector per word id
for word, index in word2id.items():
    if index < vocabulary_size:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector

# for i in range(0, 3):
#    print("The glove embedding for '{}' is {} ".format(list(word2id.keys())[i], embedding_matrix[i]))

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=vocabulary_size)
# Note: load_data returns a NumPy array of Python lists rather than a 2D array, because the reviews have variable length.

# Sanity check our training data
train_data[0][0:10]

train_labels[0]

# Since we restricted ourselves to the top 10,000 most frequent words, no word index will exceed 10,000:

max([max(sequence) for sequence in train_data])

# For kicks, here's how you can quickly decode one of these reviews back to English words:

# word_index is a dictionary mapping words to an integer index
word_index = imdb.get_word_index()
# We reverse it, mapping integer indices to words
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
# We decode the review; note that our indices were offset by 3
# because 0, 1 and 2 are reserved indices for "padding", "start of sequence", and "unknown".
decoded_review = ' '.join([reverse_word_index.get(i - 3, '?') for i in train_data[0]])

decoded_review


def decode_review(arg):
    # We decode the review; note that our indices were offset by 3
    # because 0, 1 and 2 are reserved indices for "padding", "start of sequence", and "unknown".
    
    if isinstance(arg, int):
        return ' '.join([reverse_word_index.get(i - 3, '?') for i in train_data[arg]])
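The excerpt builds embedding_matrix from the GloVe vectors but stops before using it. A plausible continuation, assuming the reviews are padded to a fixed length and the pretrained weights are kept frozen; maxlen and the classifier head are illustrative assumptions, not part of the original project:

from tensorflow.keras import Sequential
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.sequence import pad_sequences

maxlen = 256  # assumed sequence length
x_train = pad_sequences(train_data, maxlen=maxlen)
x_test = pad_sequences(test_data, maxlen=maxlen)

model = Sequential([
    # Initialise the layer with the GloVe matrix and freeze it so the vectors are not updated.
    Embedding(vocabulary_size, 100,
              embeddings_initializer=Constant(embedding_matrix),
              input_length=maxlen, trainable=False),
    GlobalAveragePooling1D(),
    Dense(16, activation="relu"),
    Dense(1, activation="sigmoid"),
])
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])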
Example #4
    def run_imdb():
        # Extract useful data from dataset
        print('Extracting the IMDB dataset')
        (train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)

        # Illustration of the input data
        print(
            f'In this dataset a label of 1 indicates a positive review, 0 a negative review.\nHaving taken the top 10,000'
            f' most-used words, no word index will exceed 10,000.\nMax Index = '
            f'{max([max(sequence) for sequence in train_data])}')

        print(
            f"For the sake of illustration, let's decode a review back to English (not being printed for easier reading)")
        word_index = imdb.get_word_index()
        reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
        decoded_review = ' '.join([reverse_word_index.get(i - 3, '?') for i in train_data[0]])
        # print(decoded_review)

        # Encoding the inputs
        print("In order to pass these lists of integers into a neural network we must first encode them as tensors of "
              "uniform length.\nIn this example we'll use one-hot encoding, done manually for the sake of understanding.")

        def vectorise_sequences(sequences, dimension=10000):
            ret = np.zeros((len(sequences), dimension))
            for i, sequence in enumerate(sequences):
                ret[i, sequence] = 1
                if i < 1:
                    print(f"\n{sequence} => {ret[i]}\n")
            return ret

        x_train = vectorise_sequences(train_data)
        y_train = np.asarray(train_labels).astype('float32')
        x_test = vectorise_sequences(test_data)
        y_test = np.asarray(test_labels).astype('float32')

        # Design and compile the model
        print("Now to build the network, this time using parameters with greater configurability")
        model = models.Sequential()
        model.add(layers.Dense(16, activation='relu', input_shape=(10000,)))
        model.add(layers.Dense(16, activation='relu'))
        model.add(layers.Dense(1, activation='sigmoid'))

        model.compile(optimizer=optimizers.RMSprop(learning_rate=0.001), loss='binary_crossentropy',
                      metrics=[metrics.binary_accuracy])

        # Divide the training data
        print("Creating a validation set for greater insight during training")
        x_val = x_train[:10000]  # Taking the 1st 10000 samples for validation
        partial_x_train = x_train[10000:]  # Leaving everything from 10000 onwards for training
        y_val = y_train[:10000]  # Taking the 1st 10000 labels for validation
        partial_y_train = y_train[10000:]  # Leaving everything from 10000 onwards for training

        # Train the model
        print("Begin training the model:")
        history = model.fit(partial_x_train, partial_y_train, epochs=20, batch_size=512, validation_data=(x_val, y_val))
        history_dict = history.history

        print(f"\nNote that the history returned by the fit function has a 'history' member which is a dictionary. "
              f"The keys are: {history_dict.keys()}")  # ['loss', 'binary_accuracy', 'val_loss', 'val_binary_accuracy']

        # Prepare to plot the training and validation information
        loss_values = history_dict['loss']
        val_loss_values = history_dict['val_loss']
        acc_values = history_dict['binary_accuracy']
        val_acc_values = history_dict['val_binary_accuracy']

        epochs = range(1, len(history_dict['binary_accuracy']) + 1)
        plt.plot(epochs, loss_values, 'bo', label='Training Loss')
        plt.plot(epochs, val_loss_values, 'b', label='Validation Loss')
        plt.title('Training and Validation Loss')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.legend()
        plt.show()

        plt.clf()
        plt.plot(epochs, acc_values, 'bo', label='Training Accuracy')
        plt.plot(epochs, val_acc_values, 'b', label='Validation Accuracy')
        plt.title('Training and Validation Accuracy')
        plt.xlabel('Epochs')
        plt.ylabel('Accuracy')
        plt.legend()
        plt.show()

        # Evaluate the model
        print("\nAfter reviewing each plot, evaluate the performance of the model on new data")
        results = model.evaluate(x_test, y_test)
        print(f"Evaluation Results: Loss = {results[0]}    Accuracy = {results[1] * 100}%")
Example #5
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing import sequence

# Preprocessing parameters
sequence_length = 400
max_words = 5000

# Word2Vec parameters
min_word_count = 1
context = 10

imdb = tf.keras.datasets.imdb
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=5000,start_char=None,
                                                              oov_char=None, index_from=None)      
x_train = sequence.pad_sequences(x_train, maxlen=sequence_length, padding="post", truncating="post")
x_test = sequence.pad_sequences(x_test, maxlen=sequence_length, padding="post", truncating="post")

vocabulary = imdb.get_word_index()
vocabulary_inv = dict((v, k) for k, v in vocabulary.items())
vocabulary_inv[0] = "<PAD/>"

embedding_weights = train_word2vec(np.vstack((x_train, x_test)), vocabulary_inv, num_features=embedding_dim,
                                       min_word_count=min_word_count, context=context)


#teacher-predictions
#teacher = 'lstm'
#def get_teacher_predictions(teacher):
#  if teacher == 'lstm':
#    return y_train


Example #6
from tensorflow import keras
from tensorflow.keras.datasets import imdb
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
import pandas
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy

(x_train, y_train), (x_test, y_test) = imdb.load_data(path="imdb.npz",
                                                      num_words=None,
                                                      skip_top=0,
                                                      maxlen=666,
                                                      seed=113,
                                                      start_char=1,
                                                      oov_char=2,
                                                      index_from=3)
x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen=666)

word_to_id = imdb.get_word_index()
word_to_id = {k: (v + 3) for k, v in word_to_id.items()}
word_to_id["<PAD>"] = 0
word_to_id["<START>"] = 1
word_to_id["<UNK>"] = 2

id_to_word = {value: key for key, value in word_to_id.items()}
print(' '.join(id_to_word[id] for id in x_train[0]))

model = keras.Sequential()
model.add(Dense(500, input_dim=666, activation='tanh'))
model.add(Dense(200, activation='tanh'))
model.add(Dense(1, name="output_layer", activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='AdaDelta',
Example #7
    return pre_proc_text


# The IMDB dataset uses 88,584 distinct words in total (the vocabulary size).
# Only the 6,000 most frequent words are kept; the rest are marked as OOV.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=6000,
                                                      start_char=0,
                                                      oov_char=0,
                                                      index_from=0)

# Combine the train and test data; they can be split again later if needed.
text = np.hstack([x_train, x_test])
label = np.hstack([y_train, y_test])

# Load the vocabulary.
word2idx = imdb.get_word_index()
idx2word = dict((v, k) for k, v in word2idx.items())

# start_char and oov_char are marked as '.'; they are removed later during preprocessing.
idx2word[0] = '.'


# Convert the integer-encoded x_train back into actual words.
def decode(review):
    x = [idx2word[s] for s in review]
    return ' '.join(x)


# Preprocess the review documents.
reviews = []
for i, review in enumerate(text):
Example #8
"""
Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1zs--3ULwCynfqLilHDzTUEoPb51c_qTT
"""

import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import imdb
from tensorflow.keras import models, layers
from tensorflow import keras
import tensorflow as tf

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000) # Keep only the top 10,000 most frequent words

word_index = imdb.get_word_index() # word_index is a dictionary mapping words to an integer index.
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()]) # Reverses it, mapping integer indices to words

# Decodes the review to read its content. Note that the indices are offset by 3,
# because 0, 1, and 2 are reserved indices for "padding", "start of sequence", and "unknown".
decoded_review = ' '.join([reverse_word_index.get(i-3, '?') for i in train_data[100]])
print(decoded_review)

# one-hot encoding
def vectorize_sequences(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1
    return results

x_train = vectorize_sequences(train_data)
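This Colab excerpt stops after vectorizing the training data. A sketch of the usual continuation with the already-imported models and layers; the architecture and epoch count below follow the standard baseline for this dataset and are shown as an illustration rather than the notebook's own code:

x_test = vectorize_sequences(test_data)
y_train = np.asarray(train_labels).astype('float32')
y_test = np.asarray(test_labels).astype('float32')

model = models.Sequential()
model.add(layers.Dense(16, activation='relu', input_shape=(10000,)))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(x_train, y_train, epochs=4, batch_size=512, validation_split=0.2)
model.evaluate(x_test, y_test)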
Example #9

# Training and test data are loaded via Keras
# Alternatively, you can download the file directly as a pickle file
(X_train, y_train), (X_test,
                     y_test) = imdb.load_data(path="imdb.npz",
                                              num_words=VOCABULARY_SIZE,
                                              skip_top=0,
                                              maxlen=None,
                                              seed=113,
                                              start_char=START_CHAR,
                                              oov_char=2,
                                              index_from=INDEX_FROM)

# The file is downloaded as imdb_word_index.json
word_to_id = imdb.get_word_index(path="./imdb_word_index.json")

# Store the correct index together with the matching word, since there is an index shift of +3 (see the explanation at
# https://keras.io/datasets/#imdb-movie-reviews-sentiment-classification)
# From: https://stackoverflow.com/questions/42821330/restore-original-text-from-keras-s-imdb-dataset
word_to_id = {k: (v + INDEX_FROM) for k, v in word_to_id.items()}
word_to_id["<PAD>"] = 0
word_to_id["<START>"] = START_CHAR  # 1
word_to_id["<UNK>"] = 2
id_to_word = {value: key for key, value in word_to_id.items()}

# Show the content of one review (selected by REVIEW_INDEX)
REVIEW_INDEX = 2
print(X_train[REVIEW_INDEX])
print("---- Rezensionstext --------- ")
print(' '.join(id_to_word[id] for id in X_train[REVIEW_INDEX]))
Example #10
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, GlobalAveragePooling1D
from tensorflow.keras.datasets import imdb
import os

word_indexes = imdb.get_word_index()
word_indexes = {k: (v + 3) for k, v in word_indexes.items()}  # shift by 3: load_data() defaults to index_from=3
word_indexes["<PAD>"] = 0
word_indexes["<START>"] = 1
word_indexes["<UNK>"] = 2
reverse_word_indexes = {v: k for k, v in word_indexes.items()}


def load_data():
    (X1, y1), (X2, y2) = imdb.load_data()
    return X1, y1, X2, y2


def preprocess_data(train_data, test_data):
    train_data = keras.preprocessing.sequence.pad_sequences(train_data,
                                                            maxlen=512,
                                                            value=0,
                                                            padding='post')
    test_data = keras.preprocessing.sequence.pad_sequences(test_data,
                                                           maxlen=512,
                                                           value=0,
                                                           padding='post')
    return train_data, test_data
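The Embedding, Dense, and GlobalAveragePooling1D imports suggest a small averaged-embedding classifier, but the model itself is not part of this excerpt. A sketch of how load_data and preprocess_data might be tied together with such a model; the vocabulary size, embedding width, and training settings are assumptions:

def build_model(vocab_size=len(word_indexes), embedding_dim=16, maxlen=512):
    model = Sequential([
        Embedding(vocab_size, embedding_dim, input_length=maxlen),
        GlobalAveragePooling1D(),
        Dense(16, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model


X_train, y_train, X_test, y_test = load_data()
X_train, X_test = preprocess_data(X_train, X_test)
model = build_model()
model.fit(X_train, y_train, epochs=5, batch_size=512, validation_split=0.2)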

Example #11
import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from flask import Flask, jsonify, make_response, request
import google.cloud.logging
import logging

client = google.cloud.logging.Client()

client.setup_logging()

app = Flask(__name__)

encoder_dict = imdb.get_word_index(path="imdb_word_index.json")
def encode(sent):
    lst = []
    for i in sent.lower().split():
        if i in encoder_dict and encoder_dict[i] < 50000:
            lst.append(encoder_dict[i])
    return lst

dec_d = {v: k for k, v in encoder_dict.items()}

def decode(sent):
    out = ''
    for i in sent:
        if i in dec_d:
            out = out + " " + dec_d[i]
    return out
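The excerpt wires up Flask, logging, and the encode/decode helpers, but the prediction endpoint itself is not shown. A hypothetical route illustrating how they could be combined; the model path, padding length, and request format are all assumptions rather than the project's actual choices:

model = tf.keras.models.load_model("imdb_model.h5")  # assumed: a model trained elsewhere
MAXLEN = 500  # assumed padding length used at training time

@app.route("/predict", methods=["POST"])
def predict():
    review = request.get_json(force=True).get("review", "")
    padded = sequence.pad_sequences([encode(review)], maxlen=MAXLEN)
    score = float(model.predict(padded)[0][0])
    logging.info("prediction score: %s", score)
    return jsonify({"positive_probability": score})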
Example #12
import subprocess
import requests
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, Embedding, LSTM, Bidirectional
from tensorflow.keras.optimizers import Adam, SGD, RMSprop
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.datasets import imdb

### load data and preprocess
max_len = 32
trunc_type='post'
padding_type='post'

(train_x,train_y) , (test_x , test_y) = imdb.load_data()
word2idx_dict = imdb.get_word_index()
# idx2word_dict = {idx:word for word,idx in word2idx_dict.items()}
# print(train_x.shape,test_x.shape)
# def idxs2sentence(idxs):
#     return [idx2word_dict[idx] for idx in idxs]
# print(train_x[1])
# print(idxs2sentence(train_x[1]))
vocab_size = len(word2idx_dict)

train_x = pad_sequences(
    train_x
    ,maxlen = max_len
    ,truncating=trunc_type
    ,padding =padding_type
)
test_x = pad_sequences(