Example #1
def reuters_raw(max_features=20000):

    index_offset = 3  # word index offset

    (x_train, y_train), (x_test, y_test) = reuters.load_data(
        num_words=max_features, index_from=index_offset
    )
    y_train = y_train.reshape(-1, 1)
    y_test = y_test.reshape(-1, 1)

    word_to_id = reuters.get_word_index()
    word_to_id = {k: (v + index_offset) for k, v in word_to_id.items()}
    word_to_id["<PAD>"] = 0
    word_to_id["<START>"] = 1
    word_to_id["<UNK>"] = 2

    id_to_word = {value: key for key, value in word_to_id.items()}
    x_train = list(
        map(lambda sentence: " ".join(id_to_word[i] for i in sentence), x_train)
    )
    x_test = list(
        map(lambda sentence: " ".join(id_to_word[i] for i in sentence), x_test)
    )
    x_train = np.array(x_train, dtype=str)  # np.str was removed from NumPy; plain str works
    x_test = np.array(x_test, dtype=str)
    return (x_train, y_train), (x_test, y_test)
Example #2

    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        # del data_dir
        del tmp_dir
        del dataset_split

        (train_data, _), (test_data,
                          _) = reuters.load_data(num_words=vocab_size,
                                                 seed=1337,
                                                 test_split=0.2)

        word_index = self._get_word_index()
        reverse_word_index = dict([(value, key)
                                   for (key, value) in word_index.items()])

        # for seq in test_data:
        for seq in train_data:
            idx = 0
            while idx + N < len(seq) - 1:
                yield {
                    "inputs":
                    " ".join([
                        reverse_word_index.get(i, '?')
                        for i in seq[idx:idx + N]
                    ]),
                    "targets":
                    str(reverse_word_index.get(seq[idx + N], '?')),
                }
                idx += 1
Example #3
def test_on_reuters():
    max_len = 500
    embedding_dim = 128
    num_words = 10000
    sub_lstms = 5
    (x_train, y_train), (x_test,
                         y_test) = reuters.load_data(num_words=num_words)
    # word2idx = reuters.get_word_index()
    # idx2word = {idx:word for word,idx in word2idx.items()}

    x_train = pad_sequences(x_train, maxlen=max_len)
    x_test = pad_sequences(x_test, maxlen=max_len)
    # print(y_train.max())
    # print(y_train.min())

    inputs = keras.Input((max_len, ))
    x = layers.Embedding(num_words + 1,
                         embedding_dim,
                         mask_zero=True,
                         input_length=max_len)(inputs)
    x = MaskReshape((int(max_len / sub_lstms), int(embedding_dim * sub_lstms)),
                    factor=sub_lstms)(x)
    x = JujubeCake(16, sub_lstms)(x)
    x = layers.Dense(46, activation='softmax')(x)
    model = keras.Model(inputs, x)
    model.summary()
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['acc'])
    model.fit(x_train,
              y_train,
              epochs=10,
              batch_size=128,
              validation_split=0.1)
    res = model.evaluate(x_test, y_test, batch_size=128)
    print('\n')
    print('#' + '#' * 70 + '#')
    print('#' + ('Evaluation result').center(70, ' ') + '#')
    print('#' + '#' * 70 + '#')
    print('\n')
    for metrics, value in zip(model.metrics_names, res):
        print('%s %.4f' % (metrics, value))

    inputs = keras.Input((max_len, ))
    x = layers.Embedding(num_words + 1,
                         embedding_dim,
                         mask_zero=True,
                         input_length=max_len)(inputs)
    x = layers.LSTM(128, return_sequences=True)(x)
    x = layers.LSTM(64, return_sequences=False)(x)
    x = layers.Dense(46, activation='softmax')(x)
    model = keras.Model(inputs, x)
    model.summary()
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['acc'])
    model.fit(x_train,
              y_train,
              epochs=10,
              batch_size=128,
              validation_split=0.1)
    res = model.evaluate(x_test, y_test, batch_size=128)
    print('\n')
    print('#' + '#' * 70 + '#')
    print('#' + ('Evaluation result').center(70, ' ') + '#')
    print('#' + '#' * 70 + '#')
    print('\n')
    for metrics, value in zip(model.metrics_names, res):
        print('%s %.4f' % (metrics, value))
Example #4
# 2020-12-04
# embedding + training example

from tensorflow.keras.datasets import reuters
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# vocabulary size (number of words to keep)
words = 10000

# 1. Data
(x_train, y_train), (x_test, y_test) = reuters.load_data(
    num_words=words,  # vocabulary size; adjustable
    test_split=0.2)

print(x_train.shape, x_test.shape)  # (8982,) (2246,)   # 8,982 articles
print(y_train.shape, y_test.shape)  # (8982,) (2246,)   # y spans 46 topics

print(x_train[0])
print(y_train[0])
# already tokenized (each article is a sequence of word indices)

print(len(x_train[0]))  #87
print(len(x_train[11]))  #59

# print the number of y categories
category = np.max(y_train) + 1
print("categories: ", category)  # categories: 46 (softmax output)

Example #5

# Classify newswire articles by topic
from tensorflow.keras.datasets import reuters
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import models
from tensorflow.keras import layers

import numpy as np
import matplotlib.pyplot as plt

def vectorize_sequences(sequences, dimension = 10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1
    return results

(train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words = 10000)
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)
one_hot_train_labels = to_categorical(train_labels)
one_hot_test_labels = to_categorical(test_labels)

model = models.Sequential()
# Use 64 hidden units to have a higher-dimensional space, needed to separate the 46 output classes
model.add(layers.Dense(64, activation = 'relu', input_shape = (10000, )))
model.add(layers.Dense(64, activation = 'relu'))
model.add(layers.Dense(46, activation = 'softmax'))

model.compile(optimizer = 'rmsprop', loss = 'categorical_crossentropy', metrics = ['acc'])

x_val = x_train[:1000]
partial_x_train = x_train[1000:]
y_val = one_hot_train_labels[:1000]
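The example above is cut off before training. A minimal sketch of how it could continue, following the same pattern used in Example #15; the held-out split size, epoch count, and batch size here are assumptions rather than part of the original snippet.

# Assumed continuation: the remaining samples/labels form the training split
partial_y_train = one_hot_train_labels[1000:]

history = model.fit(partial_x_train, partial_y_train,
                    epochs=20, batch_size=512,
                    validation_data=(x_val, y_val))

# Evaluate on the held-out test set
results = model.evaluate(x_test, one_hot_test_labels)
print('test loss, test acc:', results)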
Example #6
# - Use Keras to fit a predictive model, classifying news articles into topics. 
# - Report your overall score and accuracy
# 
# For reference, the [Keras IMDB sentiment classification example](https://github.com/keras-team/keras/blob/master/examples/imdb_lstm.py) will be useful, as well as the RNN code we used in class.
# 
# __*Note:*__  Focus on getting a running model, not on maxing accuracy with extreme data size or epoch numbers. Only revisit and push accuracy if you get everything else done!

# In[20]:


from tensorflow.keras.datasets import reuters

(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=None,
                                                         skip_top=0,
                                                         maxlen=None,
                                                         test_split=0.2,
                                                         seed=723812,
                                                         start_char=1,
                                                         oov_char=2,
                                                         index_from=3)


# In[21]:


# Demo of encoding

# We already have the integer indices; now fetch the word-to-index mapping from reuters_word_index.json

word_index = reuters.get_word_index(path="reuters_word_index.json")

print(f"Iran is encoded as {word_index['iran']} in the data")
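The note at the top of this example asks for a running topic classifier and a reported score. One possible sketch with the data loaded above; the multi-hot encoding, layer sizes, and epoch count are assumptions, not part of the original notebook.

import numpy as np
from tensorflow.keras import models, layers

num_words = 10000  # assumption: cap the vocabulary for the multi-hot encoding

def multi_hot(sequences, dimension=num_words):
    # Mark each word index that occurs in an article with a 1
    out = np.zeros((len(sequences), dimension))
    for i, seq in enumerate(sequences):
        out[i, [w for w in seq if w < dimension]] = 1.0
    return out

X_tr, X_te = multi_hot(x_train), multi_hot(x_test)

model = models.Sequential([
    layers.Dense(64, activation='relu', input_shape=(num_words,)),
    layers.Dense(64, activation='relu'),
    layers.Dense(46, activation='softmax'),
])
model.compile(optimizer='rmsprop',
              loss='sparse_categorical_crossentropy',  # integer labels, no one-hot needed
              metrics=['accuracy'])
model.fit(X_tr, y_train, epochs=5, batch_size=512, validation_split=0.1)
print(model.evaluate(X_te, y_test))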
Example #7
"""

import tensorflow as tf
from tensorflow.keras.datasets import reuters
import matplotlib.pyplot as plt
import numpy as np
import string
import textwrap
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Embedding, Dense, Dropout, Input, LSTM, GRU, Bidirectional, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
"""Uses the [Reuters newswire](https://keras.io/api/datasets/reuters/) classification dataset, which has text paired with 46 topics as labels. You can see what these labels represent [here](https://martin-thoma.com/nlp-reuters/)."""

(X_train, y_train), (_, _) = reuters.load_data()

# https://stackoverflow.com/questions/42821330/restore-original-text-from-keras-s-imdb-dataset
# Needed to encode our own articles later

word_dict = reuters.get_word_index()
word_dict = {k: (v + 3) for k, v in word_dict.items()}
word_dict["<PAD>"] = 0
word_dict["<START>"] = 1
word_dict["<UNK>"] = 2
word_dict["<UNUSED>"] = 3

vocab_size = len(word_dict.keys())

# Needed to decode training data into readable text
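The snippet stops at the comment above; a decode helper presumably follows. A minimal sketch of what it could look like, built on the word_dict defined above (the helper name decode_article is hypothetical).

# Invert the word-to-id mapping so ids can be turned back into words
id_to_word = {idx: word for word, idx in word_dict.items()}

def decode_article(sequence):
    # Unknown ids fall back to the <UNK> marker
    return " ".join(id_to_word.get(idx, "<UNK>") for idx in sequence)

print(decode_article(X_train[0]))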
Example #8
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import reuters
from tensorflow.keras import models
from tensorflow.keras import layers

(train_data, train_labels), (test_data,
                             test_labels) = reuters.load_data(num_words=10000)


def vectorize_sequences(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1
    return results


def to_one_hot(labels, dimension=46):
    results = np.zeros((len(labels), dimension))
    for i, label in enumerate(labels):
        results[i, label] = 1
    return results


x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)

one_hot_train_labels = to_one_hot(train_labels)
one_hot_test_labels = to_one_hot(test_labels)

model = models.Sequential()
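This example ends right after creating the empty Sequential model. A sketch of how it is usually completed in this workflow; the layer sizes, optimizer, and training settings below are assumptions borrowed from the pattern in Example #15, not from this snippet.

model.add(layers.Dense(64, activation='relu', input_shape=(10000,)))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(46, activation='softmax'))

model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.fit(x_train, one_hot_train_labels,
          epochs=9, batch_size=512, validation_split=0.1)
print(model.evaluate(x_test, one_hot_test_labels))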
Example #9

#
# Text output in TensorBoard with tf.summary
#

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.datasets import reuters

# Load the Reuters dataset
INDEX_FROM = 3
START_CHAR = 1
(x_train, y_train), (x_test, y_test) = reuters.load_data(path="reuters.npz",
                                                         num_words=None,
                                                         skip_top=0,
                                                         maxlen=None,
                                                         test_split=0.2,
                                                         seed=113,
                                                         start_char=START_CHAR,
                                                         oov_char=2,
                                                         index_from=INDEX_FROM)

# Mapping from id to word
word_index = reuters.get_word_index(path="reuters_word_index.json")
word_index = {k: (v + INDEX_FROM) for k, v in word_index.items()}
word_index["<PAD>"] = 0
word_index["<START>"] = START_CHAR  # 1
word_index["<UNK>"] = 2
id_to_word = {value: key for key, value in word_index.items()}


# Function that returns a Reuters newswire as a readable string
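The comment above announces a function that returns a Reuters newswire as a string, but the snippet ends there. A minimal sketch of such a function plus the tf.summary text logging the example's title refers to; the function name, log directory, and summary tag are assumptions.

def decode_newswire(sequence):
    # Map each id back to its word; unknown ids become "?"
    return " ".join(id_to_word.get(i, "?") for i in sequence)

# Hypothetical TensorBoard logging: write the first newswire as a text summary
writer = tf.summary.create_file_writer("logs/reuters_text")
with writer.as_default():
    tf.summary.text("first_newswire", tf.constant(decode_newswire(x_train[0])), step=0)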
Example #10
                        epochs=epochs,
                        batch_size=512,
                        validation_data=(X_val, y_val),
                        verbose=0)

    val_loss = history.history['val_loss']

    return val_loss


if "__main__" == __name__:

    # load data
    np_load_old = np.load
    np.load = lambda *a, **k: np_load_old(*a, allow_pickle=True, **k)
    (X_train, y_train), (X_test, y_test) = reuters.load_data(num_words=10000)
    word_index = reuters.get_word_index(path="reuters_word_index.json")
    np.load = np_load_old

    X_train = vectorize_sequences(X_train)
    X_test = vectorize_sequences(X_test)
    one_hot_y_train = to_categorical(y_train)
    one_hot_y_test = to_categorical(y_test)

    X_val = X_train[:1000]
    partial_X_train = X_train[1000:]
    y_val = one_hot_y_train[:1000]
    partial_y_train = one_hot_y_train[1000:]

    batch_sizes = [200, 400, 600, 800, 1000]
    color = ['b', 'r', 'y', 'g', 'orange']  # 'o' is a marker symbol, not a color name
Example #11
"""

# Import libraries
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
np.random.seed(100)
from tensorflow.keras.datasets import reuters
from tensorflow.keras.preprocessing.text import Tokenizer

# Get data
np_load_old = np.load
np.load = lambda *a, **k: np_load_old(*a, allow_pickle=True, **k)
(x, y), (xtest, ytest) = reuters.load_data(num_words=10000)
np.load = np_load_old

# Process text
tokenizer = Tokenizer(num_words=10000)
xtrain = tokenizer.sequences_to_matrix(x, mode='binary')
xtest = tokenizer.sequences_to_matrix(xtest, mode='binary')

ytrain = to_categorical(y)
ytest = to_categorical(ytest)

# Initialize model
model = Sequential()
model.add(Dense(512, activation='relu'))
model.add(Dropout(.5))
model.add(Dense(ytrain.shape[1], activation='softmax'))
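The snippet stops after the model definition. A short sketch of a possible compile-and-train step; the optimizer, batch size, and epoch count are assumptions, not part of the original.

model.compile(optimizer='adam',
              loss='categorical_crossentropy',  # labels were one-hot encoded above
              metrics=['accuracy'])
model.fit(xtrain, ytrain, epochs=5, batch_size=64, validation_split=0.1)
print(model.evaluate(xtest, ytest))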
Example #12
    def load_data(self):
        return reuters.load_data(num_words=self.num_words)
Example #13
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import models, layers, regularizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.layers import Conv1D, Conv2D, MaxPooling1D, MaxPooling2D, Dense, Flatten, Dropout, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_words = 1000
maxlen = 250

# Load the data from the built-in Reuters dataset in Keras
(X_train, y_train), (X_test, y_test) = reuters.load_data(num_words=max_words,
                                                         test_split=0.25,
                                                         maxlen=maxlen)
#Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/reuters_word_index.json
#557056/550378 [==============================] - 0s 1us/step
print("Train_data ", X_train.shape)
print("Train_labels ", y_train.shape)
print("Test_data ", X_test.shape)
print("Test_labels ", y_test.shape)


#*--------------------------------------------------------------------------------------------
def vectorize_sequences(sequences, dimension=max_words):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        #print(i,'+',sequence)
        results[i, sequence] = 1.
    return results
Example #14
# import nltk
# nltk.download()
from nltk import word_tokenize
"""
English tokenization library
Natural Language Toolkit
"""

sentence = "Natural language processing (NLP) is a subfield of computer science, information engineering, " \
           "and artificial intelligence concerned with the interactions between computers and human (natural) " \
           "languages, in particular how to program computers to process and analyze large amounts of natural " \
           "language data."
# print(word_tokenize(sentence))
# ['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'subfield', 'of', 'computer', 'science', ',',
# 'information', 'engineering', ',', 'and', 'artificial', 'intelligence', 'concerned', 'with', 'the', 'interactions',
# 'between', 'computers', 'and', 'human', '(', 'natural', ')', 'languages', ',', 'in', 'particular', 'how', 'to',
# 'program', 'computers', 'to', 'process', 'and', 'analyze', 'large', 'amounts', 'of', 'natural', 'language', 'data',
# '.']

from tensorflow.keras.datasets import reuters

(X_train, y_train), (X_test, y_test) = reuters.load_data(num_words=None,
                                                         test_split=0.2)
Example #15
    def run_reuters():
        # Extract useful data from dataset
        print('Extracting the Reuters dataset')
        (train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words=10000)

        print(f"There are {len(train_data)} training examples and {len(test_data)} testing examples")

        # Illustration of the input data
        print(
            f'In this dataset the labels denote the topic of the piece. There are 46 topics represented, each one is '
            f'mutually exclusive.\nHaving taken the top 10,000 most-used words, no word index will exceed 10,000.\n'
            f'Max Index = {max([max(sequence) for sequence in train_data])}')

        print(
            f"For the sake of illustration, let's decode an article back to English (not being printed for easier reading)")
        word_index = reuters.get_word_index()
        reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
        decoded_review = ' '.join([reverse_word_index.get(i - 3, '?') for i in train_data[0]])
        # print(decoded_review)

        # Encoding the inputs
        print("In order to pass these lists of integers into a neural network we must first encode them as tensors of "
              "uniform length.\nIn this example we'll use one-hot encoding, done manually for the sake of understanding.")

        def vectorise_sequences(sequences, dimension=10000):
            ret = np.zeros((len(sequences), dimension))
            for i, sequence in enumerate(sequences):
                ret[i, sequence] = 1
                if i < 1:
                    print(f"\n{sequence} => {ret[i]}\n")
            return ret

        x_train = vectorise_sequences(train_data)
        x_test = vectorise_sequences(test_data)

        print("For the labels this time around, there are a few options. A very common option is one-hot-encoding, for "
              "which Keras has an in-built function (a manual version is included in the code for educational purposes)")

        def to_one_hot(labels, dimension=46):
            ret = np.zeros((len(labels), dimension))
            for i, label in enumerate(labels):
                ret[i, label] = 1
            return ret

        one_hot_train_labels = to_categorical(train_labels)
        one_hot_test_labels = to_categorical(test_labels)

        # Design and compile the model
        print("Now to build the network, this time using parameters with greater configurability")
        model = models.Sequential()
        model.add(layers.Dense(64, activation='relu', input_shape=(10000,)))
        model.add(layers.Dense(64, activation='relu'))
        model.add(layers.Dense(46, activation='softmax'))

        model.compile(optimizer=optimizers.RMSprop(learning_rate=0.001), loss='categorical_crossentropy',
                      metrics=[metrics.categorical_accuracy])

        # Divide the training data
        print("Creating a validation set for greater insight during training")
        x_val = x_train[:1000]  # Taking the 1st 1000 samples for validation
        partial_x_train = x_train[1000:]  # Leaving everything from 1000 onwards for training
        y_val = one_hot_train_labels[:1000]  # Taking the 1st 1000 labels for validation
        partial_y_train = one_hot_train_labels[1000:]  # Leaving everything from 1000 onwards for training

        # Train the model
        print("Begin training the model:")
        history = model.fit(partial_x_train, partial_y_train, epochs=20, batch_size=512, validation_data=(x_val, y_val))
        history_dict = history.history

        print(f"\nNote that the history returned by the fit function has a 'history' member which is a dictionary. "
              f"The keys are: {history_dict.keys()}")  # ['loss', 'categorical_accuracy', 'val_loss', 'val_categorical_accuracy']

        # Prepare to plot the training and validation information
        loss_values = history_dict['loss']
        val_loss_values = history_dict['val_loss']
        acc_values = history_dict['categorical_accuracy']
        val_acc_values = history_dict['val_categorical_accuracy']

        epochs = range(1, len(history_dict['categorical_accuracy']) + 1)
        plt.plot(epochs, loss_values, 'bo', label='Training Loss')
        plt.plot(epochs, val_loss_values, 'b', label='Validation Loss')
        plt.title('Training and Validation Loss')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.legend()
        plt.show()

        plt.clf()
        plt.plot(epochs, acc_values, 'bo', label='Training Accuracy')
        plt.plot(epochs, val_acc_values, 'b', label='Validation Accuracy')
        plt.title('Training and Validation Accuracy')
        plt.xlabel('Epochs')
        plt.ylabel('Accuracy')
        plt.legend()
        plt.show()

        # Evaluate the model
        print("\nAfter reviewing each plot, evaluate the performance of the model on new data")
        results = model.evaluate(x_test, one_hot_test_labels)
        print(f"Evaluation Results: Loss = {results[0]}    Accuracy = {results[1] * 100}%")
Example #16
import tensorflow.keras.datasets.reuters as reuters
from corpus_preparation import *

(x_train, y_train), (x_test, y_test) = reuters.load_data(
    path="op_lab_reuters_dataset.npz",
    num_words=None,
    skip_top=0,
    maxlen=None,
    test_split=0.2,
    seed=2143,
    start_char=1,  # The start of a sequence will be marked with this character.
    oov_char=2,  # The out-of-vocabulary character.
    index_from=3,  # Index actual words with this index and higher.
)
reuters.get_word_index(path="reuters_word_index.json")
corpus = generate_corpus(x_train[:3000],
                         k=100,
                         normalized=False,
                         filename='example_corpus.csv')
print("corpus created")
Example #17
# Vocabulary size via num_words, truncation with maxlen, and modeling with an Embedding layer
from tensorflow.keras.datasets import reuters
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, Flatten, Conv1D
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

(x_train, y_train), (x_test, y_test) = reuters.load_data(
    num_words = 5000, test_split=0.2
) # num_words = 5000: keep only the 5,000 most frequent words


print(x_train[0], type(x_train[0]))
print(y_train[0])
print(len(x_train[0]), len(x_train[11])) # 87 59
print('====================================')
print(x_train.shape, x_test.shape) # (8982,) (2246,)
print(y_train.shape, y_test.shape) #(8982,) (2246,)

print('max article length : ', max(len(l) for l in x_train)) # max article length :  2376
print('mean article length : ', sum(map(len, x_train))/ len(x_train)) # mean article length :  145.5398574927633

# plt.hist([len(s) for s in x_train], bins = 40)
# plt.show() # the x-axis is the sequence length
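The header comment says the goal is to truncate with maxlen and model with an Embedding layer, but the snippet stops at the length statistics. A minimal sketch of those remaining steps; maxlen=100, the layer sizes, and the training settings are assumptions.

# Pad/truncate every article to the same assumed length
maxlen = 100
x_train_pad = pad_sequences(x_train, maxlen=maxlen)
x_test_pad = pad_sequences(x_test, maxlen=maxlen)

model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=64))  # 5000 matches num_words above
model.add(LSTM(64))
model.add(Dense(46, activation='softmax'))

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',  # integer topic labels
              metrics=['acc'])
model.fit(x_train_pad, y_train, epochs=5, batch_size=128, validation_split=0.2)
print(model.evaluate(x_test_pad, y_test))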
Example #18
def load_data_set(type, max_len, vocab_size, batch_size):
    """
    Loads the dataset. Keras IMDB dataset for binary classification; Keras Reuters dataset for multiclass classification.

    Args:
        type      : {bool} 0 for binary classification (returns the IMDB-style dataset); 1 for multiclass classification (returns the Reuters set)
        max_len   : {int} timesteps used for padding
        vocab_size: {int} size of the vocabulary
        batch_size: batch size
    Returns:
        train_loader: {torch.DataLoader} train dataloader
        x_test_pad  : padded, tokenized test data for cross-validation
        y_test      : y_test
        word_to_id  : {dict} words mapped to indices
    """
    INDEX_FROM = 2
    if not bool(type):

        NUM_WORDS = vocab_size  # use only the top vocab_size words
        dataset = pd.read_csv('df_raw_text2.csv')

        dataset = dataset[~dataset.TEXTOS.isnull()]
        dataset.drop_duplicates(subset="DOCS_ID", keep='first', inplace=True)
        dataset = dataset[dataset.V_AMB != 's/d']
        dataset.drop_duplicates(subset="TEXTOS", keep='first', inplace=True)

        texts1 = [' '.join(txt.splitlines())
                  for txt in dataset['TEXTOS']]  # remove newlines
        texts2 = [preprocess(txt) for txt in texts1
                  ]  # preprocess. ie. remove punct., lowercase, etc.

        t = Tokenizer(num_words=10000,
                      filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                      lower=True,
                      split=" ",
                      char_level=False,
                      oov_token=1)

        t.fit_on_texts(texts2)
        x_test_seq = t.texts_to_sequences(texts2)

        word_to_id = {k: v for k, v in t.word_index.items()}
        word_to_id["<PAD>"] = 0
        word_to_id["<UNK>"] = 1
        id_to_word = {value: key for key, value in word_to_id.items()}

        y = [1 if elem == 'si' else 0 for elem in dataset.VIOLENCIA_DE_GENERO]

        x = np.array(x_test_seq)
        y = np.array(y)

        x_train, x_test, y_train, y_test = train_test_split(x,
                                                            y,
                                                            train_size=.75,
                                                            random_state=1)

        x_train_pad = pad_sequences(x_train, maxlen=max_len)
        x_test_pad = pad_sequences(x_test, maxlen=max_len)

        train_data = data_utils.TensorDataset(
            torch.from_numpy(x_train_pad).type(torch.LongTensor),
            torch.from_numpy(y_train).type(torch.DoubleTensor))
        train_loader = data_utils.DataLoader(train_data,
                                             batch_size=batch_size,
                                             drop_last=True)
        return train_loader, x_test_pad, y_test, word_to_id

    else:
        from tensorflow.keras.datasets import reuters

        train_set, test_set = reuters.load_data(path="reuters.npz",
                                                num_words=vocab_size,
                                                skip_top=0,
                                                index_from=INDEX_FROM)
        x_train, y_train = train_set[0], train_set[1]
        x_test, y_test = test_set[0], test_set[1]
        word_to_id = reuters.get_word_index(path="reuters_word_index.json")
        word_to_id = {k: (v + 3) for k, v in word_to_id.items()}
        word_to_id["<PAD>"] = 0
        word_to_id["<START>"] = 1
        word_to_id["<UNK>"] = 2
        word_to_id['<EOS>'] = 3
        id_to_word = {value: key for key, value in word_to_id.items()}
        x_train_pad = pad_sequences(x_train, maxlen=max_len)
        x_test_pad = pad_sequences(x_test, maxlen=max_len)

        train_data = data_utils.TensorDataset(
            torch.from_numpy(x_train_pad).type(torch.LongTensor),
            torch.from_numpy(y_train).type(torch.LongTensor))
        train_loader = data_utils.DataLoader(train_data,
                                             batch_size=batch_size,
                                             drop_last=True)
        return train_loader, train_set, test_set, x_test_pad, word_to_id
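Given the docstring above, a hedged usage sketch of the multiclass (Reuters) branch; the argument values are assumptions, and the module-level imports the function relies on (pandas, torch, pad_sequences, data_utils, etc.) are taken as already present in the original file.

# Hypothetical call: Reuters branch, 200 timesteps, 10,000-word vocabulary, batches of 32
train_loader, train_set, test_set, x_test_pad, word_to_id = load_data_set(
    type=1, max_len=200, vocab_size=10000, batch_size=32)

for batch_x, batch_y in train_loader:
    print(batch_x.shape, batch_y.shape)  # e.g. torch.Size([32, 200]) torch.Size([32])
    break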
Example #19
'''  Multi-class text classification - Reuters newswire topic classification
    A dataset of 11,228 Reuters newswires labeled over 46 topics. As with the IMDB dataset, each newswire is encoded as a sequence of word indices (using the same scheme).
'''
from tensorflow.keras.datasets import reuters
(train_feat, train_label), (test_feat, test_label) = reuters.load_data(num_words=10000) # use the 10,000 most frequent words
print(train_feat[:3])
print(train_label[:3])
print(train_feat.shape, train_label.shape, test_feat.shape) # (8982,) (8982,) (2246,)
print()

# Decode the index lists back into the actual Reuters text #
word_index = reuters.get_word_index()
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
decode_re = ' '.join([reverse_word_index.get(i - 3, '?') for i in train_feat[0]])
print(decode_re, '\n\n')
# =================== #

# feature : list => vector
import numpy as np
def vector_seq(sequence, dim = 10000):
    results = np.zeros((len(sequence), dim))  # create a zero matrix
    for i, seq in enumerate(sequence):
        results[i, seq] = 1.
    return results
x_train = vector_seq(train_feat)  # vectorize train_feat
x_test = vector_seq(test_feat)   # vectorize test_feat
print(x_train, x_train.shape)
print(x_test, x_test.shape)


# # manual one-hot encoding
Example #20
# Set global constants
vocabulary_size = 10000  # keep the 10k most-used words for the truncated vocabulary
sequence_length = 1000  # choose 1000-word sequences
embedding_dims = 50  # number of dimensions to represent each word in vector space
batch_size = 32  # feed the neural network in 32-example training batches
num_epochs = 30  # number of times the neural network goes over EACH training example
config = int(args.config)  # model configuration

# Setup np.load() with allow_pickle=True before loading data as described in:
# https://stackoverflow.com/questions/55890813/how-to-fix-object-arrays-cannot-be-loaded
np_load_old = np.load
np.load = lambda *a, **k: np_load_old(*a, allow_pickle=True, **k)

# Load the Reuters news dataset for document classification
(X_train, Y_train), (X_test,
                     Y_test) = reuters.load_data(num_words=vocabulary_size)

# Set more global constants
num_categories = max(Y_train) + 1

# Restore np.load() for future normal usage
np.load = np_load_old

# Pad sequences to maximum found sequence length
X_train = pad_sequences(sequences=X_train, maxlen=sequence_length)
X_test = pad_sequences(sequences=X_test, maxlen=sequence_length)

# Compute cutoffs so training & test examples fit into whole batches
training_cutoff, test_cutoff = len(X_train) % batch_size, len(
    X_test) % batch_size
# Guard against a zero cutoff: slicing with [:-0] would drop every example
if training_cutoff:
    X_train, Y_train = X_train[:-training_cutoff], Y_train[:-training_cutoff]
Example #21

def save_reuters():
    OUT_DIR = 'reuters'

    # Load data from keras API
    (x_train, y_train), (x_test, y_test) = reuters.load_data()

    # get word index
    word_index = reuters.get_word_index()
    # make dictionary to reference index
    word_list = {(value + 3): key for key, value in word_index.items()}
    INVALID_STR = '#$%'
    # define invalid string to remove them later
    word_list[0] = INVALID_STR
    word_list[1] = INVALID_STR
    word_list[2] = INVALID_STR

    # define class names from ex: https://github.com/keras-team/keras/issues/12072
    class_list = [
        'cocoa', 'grain', 'veg-oil', 'earn', 'acq', 'wheat', 'copper',
        'housing', 'money-supply', 'coffee', 'sugar', 'trade', 'reserves',
        'ship', 'cotton', 'carcass', 'crude', 'nat-gas', 'cpi', 'money-fx',
        'interest', 'gnp', 'meal-feed', 'alum', 'oilseed', 'gold', 'tin',
        'strategic-metal', 'livestock', 'retail', 'ipi', 'iron-steel',
        'rubber', 'heat', 'jobs', 'lei', 'bop', 'zinc', 'orange', 'pet-chem',
        'dlr', 'gas', 'silver', 'wpi', 'hog', 'lead'
    ]

    # make train/test dirs and class dirs
    for cid, class_name in enumerate(class_list):
        os.makedirs(os.path.join(OUT_DIR, 'train',
                                 '{:02d}_{}'.format(cid, class_name)),
                    exist_ok=True)
        os.makedirs(os.path.join(OUT_DIR, 'test',
                                 '{:02d}_{}'.format(cid, class_name)),
                    exist_ok=True)

    # convert train data
    for num, (x_data, y_data) in enumerate(zip(x_train, y_train)):
        # make file path
        fpath = os.path.join(OUT_DIR, 'train',
                             '{:02d}_{}'.format(y_data, class_list[y_data]),
                             'train_{:05d}.txt'.format(num))
        with open(fpath, mode='w', encoding='utf-8') as f:
            # convert indices and join words with space
            word_org = ' '.join(word_list[inx] for inx in x_data)
            # remove invalid strings
            word_org = word_org.replace(INVALID_STR + ' ', '')
            # save text
            f.write(word_org)

    # convert test data
    for num, (x_data, y_data) in enumerate(zip(x_test, y_test)):
        # make file path
        fpath = os.path.join(OUT_DIR, 'test',
                             '{:02d}_{}'.format(y_data, class_list[y_data]),
                             'test_{:05d}.txt'.format(num))
        with open(fpath, mode='w', encoding='utf-8') as f:
            # convert indices and join words with space
            word_org = ' '.join(word_list[inx] for inx in x_data)
            # remove invalid strings
            word_org = word_org.replace(INVALID_STR + ' ', '')
            # save text
            f.write(word_org)

    print()
    print('Saved to ' + OUT_DIR + '/')
    print()
Example #22
import numpy as np
from tensorflow.keras.datasets import reuters
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dropout, LSTM, Dense
from tensorflow.keras.utils import to_categorical

seed = 10
np.random.seed(seed)  # set the random seed
# Load the Reuters dataset
top_words = 10000
(X_train, Y_train), (X_test, Y_test) = reuters.load_data(num_words=top_words)
# Data preprocessing
max_words = 200
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)
# One-hot encoding
Y_train = to_categorical(Y_train, 46)
Y_test = to_categorical(Y_test, 46)
# Define the model
model = Sequential()
model.add(Embedding(top_words, 32, input_length=max_words))
model.add(Dropout(0.75))
model.add(LSTM(32, return_sequences=True))
model.add(LSTM(32))
model.add(Dropout(0.5))
model.add(Dense(46, activation="softmax"))
model.summary()  # print a summary of the model
# Compile the model
model.compile(loss="categorical_crossentropy",
              optimizer="rmsprop",
Example #23
print(tfidfv.transform(corpus).toarray())
print(tfidfv.vocabulary_)  # compare index positions; values differ because each package computes TF-IDF differently
"""# TF-IDF `TfidfVectorizer` on reuter's dataset
Fit TF-IDF on the Reuters data\
https://wikidocs.net/22933
"""

from tensorflow.keras.datasets import reuters
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# Load the Reuters dataset: 10,000 words, 80% train / 20% test split
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=10000,
                                                         test_split=0.2)

print('Number of training samples : {}'.format(len(x_train)))

print('Number of test samples : {}'.format(len(x_test)))

# display first data set of train and test
print(x_train[0])
print(x_test[0])

# display first target class of train
print(y_train[0])

# display first target class of test
print(y_test[0])
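The docstring above says the goal is to fit a TfidfVectorizer on the Reuters data, yet the snippet stops after inspecting the raw index sequences. A minimal sketch of that step, decoding the sequences back to text first; the index_from=3 offset and the vectorizer settings are assumptions.

from sklearn.feature_extraction.text import TfidfVectorizer

# Map indices back to words (reuters.load_data defaults to index_from=3)
word_index = reuters.get_word_index()
index_to_word = {idx + 3: word for word, idx in word_index.items()}

train_texts = [' '.join(index_to_word.get(i, '?') for i in seq) for seq in x_train]

tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(train_texts)
print(X_train_tfidf.shape)  # expected: (8982, 5000)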
Example #24
https://arxiv.org/pdf/1310.4546.pdf

Built on Reuters dataset of articles
"""
import random
from tqdm import tqdm

from tensorflow.keras.datasets import reuters
import numpy as np

# SECTION 1
# Get corpus
vocab_size=10000

(train, _), (test, _) = reuters.load_data(path="reuters.npz",
                                          num_words=vocab_size,
                                          test_split=0.2,
                                          seed=1337)

# A dictionary mapping words to an integer index
word_index = reuters.get_word_index()

# The first indices are reserved
word_index = {k:(v+3) for k,v in word_index.items()}
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2  # unknown
word_index["<UNUSED>"] = 3

reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
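The docstring references the word2vec paper, so the corpus built above is presumably turned into skip-gram training pairs next. A minimal sketch of that step; the window size and the small sample taken here are assumptions for illustration.

# Build (target, context) skip-gram pairs with a fixed context window
window = 2
pairs = []
for seq in tqdm(train[:100]):  # only the first 100 articles, for illustration
    for pos, target in enumerate(seq):
        lo, hi = max(0, pos - window), min(len(seq), pos + window + 1)
        for ctx_pos in range(lo, hi):
            if ctx_pos != pos:
                pairs.append((target, seq[ctx_pos]))

print(len(pairs), "skip-gram pairs, e.g.:",
      [(reverse_word_index.get(t, '?'), reverse_word_index.get(c, '?'))
       for t, c in random.sample(pairs, 3)])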