def reuters_raw(max_features=20000):
    index_offset = 3  # word index offset
    (x_train, y_train), (x_test, y_test) = reuters.load_data(
        num_words=max_features, index_from=index_offset
    )
    y_train = y_train.reshape(-1, 1)
    y_test = y_test.reshape(-1, 1)

    word_to_id = reuters.get_word_index()
    word_to_id = {k: (v + index_offset) for k, v in word_to_id.items()}
    word_to_id["<PAD>"] = 0
    word_to_id["<START>"] = 1
    word_to_id["<UNK>"] = 2
    id_to_word = {value: key for key, value in word_to_id.items()}

    # Decode the integer sequences back into space-separated text.
    x_train = list(
        map(lambda sentence: " ".join(id_to_word[i] for i in sentence), x_train)
    )
    x_test = list(
        map(lambda sentence: " ".join(id_to_word[i] for i in sentence), x_test)
    )
    # np.str was removed from NumPy; plain str works for string arrays.
    x_train = np.array(x_train, dtype=str)
    x_test = np.array(x_test, dtype=str)
    return (x_train, y_train), (x_test, y_test)
def generate_samples(self, data_dir, tmp_dir, dataset_split):
    # del data_dir
    del tmp_dir
    del dataset_split
    (train_data, _), (test_data, _) = reuters.load_data(num_words=vocab_size,
                                                        seed=1337,
                                                        test_split=0.2)
    word_index = self._get_word_index()
    reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
    # for seq in test_data:
    for seq in train_data:
        idx = 0
        # Slide a window of N context words over each article and yield the
        # following word as the prediction target.
        while idx + N < len(seq) - 1:
            yield {
                "inputs": " ".join([
                    reverse_word_index.get(i, '?') for i in seq[idx:idx + N]
                ]),
                "targets": str(reverse_word_index.get(seq[idx + N], '?')),
            }
            idx += 1
def test_on_reuters():
    max_len = 500
    embedding_dim = 128
    num_words = 10000
    sub_lstms = 5
    (x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=num_words)
    # word2idx = reuters.get_word_index()
    # idx2word = {idx: word for word, idx in word2idx.items()}
    x_train = pad_sequences(x_train, maxlen=max_len)
    x_test = pad_sequences(x_test, maxlen=max_len)
    # print(y_train.max())
    # print(y_train.min())

    inputs = keras.Input((max_len,))
    x = layers.Embedding(num_words + 1, embedding_dim, mask_zero=True,
                         input_length=max_len)(inputs)
    x = MaskReshape((int(max_len / sub_lstms), int(embedding_dim * sub_lstms)),
                    factor=sub_lstms)(x)
    x = JujubeCake(16, sub_lstms)(x)
    x = layers.Dense(46, activation='softmax')(x)
    model = keras.Model(inputs, x)
    model.summary()
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])
    model.fit(x_train, y_train, epochs=10, batch_size=128, validation_split=0.1)
    res = model.evaluate(x_test, y_test, batch_size=128)
    print('\n')
    print('#' + '#' * 70 + '#')
    print('#' + 'Evaluation result'.center(70, ' ') + '#')
    print('#' + '#' * 70 + '#')
    print('\n')
    for metrics, value in zip(model.metrics_names, res):
        print('%s %.4f' % (metrics, value))

    # Baseline for comparison: plain stacked LSTMs without the sub-LSTM reshaping.
    inputs = keras.Input((max_len,))
    x = layers.Embedding(num_words + 1, embedding_dim, mask_zero=True,
                         input_length=max_len)(inputs)
    x = layers.LSTM(128, return_sequences=True)(x)
    x = layers.LSTM(64, return_sequences=False)(x)
    x = layers.Dense(46, activation='softmax')(x)
    model = keras.Model(inputs, x)
    model.summary()
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])
    model.fit(x_train, y_train, epochs=10, batch_size=128, validation_split=0.1)
    res = model.evaluate(x_test, y_test, batch_size=128)
    print('\n')
    print('#' + '#' * 70 + '#')
    print('#' + 'Evaluation result'.center(70, ' ') + '#')
    print('#' + '#' * 70 + '#')
    print('\n')
    for metrics, value in zip(model.metrics_names, res):
        print('%s %.4f' % (metrics, value))
# 2020-12-04
# Embedding + training example
from tensorflow.keras.datasets import reuters
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Size of the word dictionary
words = 10000

# 1. Data
(x_train, y_train), (x_test, y_test) = reuters.load_data(
    num_words=words,  # size of the word dictionary; adjustable
    test_split=0.2)

print(x_train.shape, x_test.shape)  # (8982,) (2246,)  -> 8982 articles
print(y_train.shape, y_test.shape)  # (8982,) (2246,)  -> y covers 46 topics
print(x_train[0])
print(y_train[0])  # already tokenized
print(len(x_train[0]))   # 87
print(len(x_train[11]))  # 59

# Print the number of y categories
category = np.max(y_train) + 1
print("Categories: ", category)  # Categories: 46 (softmax)
# Task: classify the news articles
from tensorflow.keras.datasets import reuters
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import models
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt


def vectorize_sequences(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1
    return results


(train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words=10000)

x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)
one_hot_train_labels = to_categorical(train_labels)
one_hot_test_labels = to_categorical(test_labels)

model = models.Sequential()
# Use 64 hidden units to have a higher-dimensional space, needed for 46 different classes
model.add(layers.Dense(64, activation='relu', input_shape=(10000,)))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(46, activation='softmax'))
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['acc'])

x_val = x_train[:1000]
partial_x_train = x_train[1000:]
y_val = one_hot_train_labels[:1000]
# - Use Keras to fit a predictive model, classifying news articles into topics.
# - Report your overall score and accuracy.
#
# For reference, the [Keras IMDB sentiment classification example](https://github.com/keras-team/keras/blob/master/examples/imdb_lstm.py) will be useful, as well as the RNN code we used in class.
#
# __*Note:*__ Focus on getting a running model, not on maxing accuracy with extreme data sizes or epoch counts. Only revisit and push accuracy if you get everything else done!

# In[20]:

from tensorflow.keras.datasets import reuters

(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=None,
                                                          skip_top=0,
                                                          maxlen=None,
                                                          test_split=0.2,
                                                          seed=723812,
                                                          start_char=1,
                                                          oov_char=2,
                                                          index_from=3)

# In[21]:

# Demo of encoding:
# we got the indices above; now we get the word index from reuters_word_index.json
word_index = reuters.get_word_index(path="reuters_word_index.json")
print(f"Iran is encoded as {word_index['iran']} in the data")
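# In[ ]:

# A minimal decoding sketch (names like index_to_word / decoded_article are
# assumptions, not part of the original notebook): invert word_index and shift
# by the index_from=3 offset used above to turn an encoded article back into text.
# Indices 0-2 are reserved for padding / start / out-of-vocabulary markers.
index_to_word = {v + 3: k for k, v in word_index.items()}
index_to_word[1] = "<START>"
index_to_word[2] = "<UNK>"

decoded_article = " ".join(index_to_word.get(i, "?") for i in x_train[0])
print(decoded_article[:200])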
""" import tensorflow as tf from tensorflow.keras.datasets import reuters import matplotlib.pyplot as plt import numpy as np import string import textwrap from tensorflow.keras.preprocessing.sequence import pad_sequences from tensorflow.keras.utils import to_categorical from tensorflow.keras.layers import Embedding, Dense, Dropout, Input, LSTM, GRU, Bidirectional, Flatten from tensorflow.keras.models import Model from tensorflow.keras.regularizers import l2 """Uses the [Reuters newswire](https://keras.io/api/datasets/reuters/) classification dataset, which has text paired with 46 topics as labels. You can see what these labels represent [here](https://martin-thoma.com/nlp-reuters/).""" (X_train, y_train), (_, _) = reuters.load_data() # https://stackoverflow.com/questions/42821330/restore-original-text-from-keras-s-imdb-dataset # Needed to encode our own reviews later word_dict = reuters.get_word_index() word_dict = {k: (v + 3) for k, v in word_dict.items()} word_dict["<PAD>"] = 0 word_dict["<START>"] = 1 word_dict["<UNK>"] = 2 word_dict["<UNUSED>"] = 3 vocab_size = len(word_dict.keys()) # Needed to decode training data into readable text
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import reuters
from tensorflow.keras import models
from tensorflow.keras import layers

(train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words=10000)


def vectorize_sequences(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1
    return results


def to_one_hot(labels, dimension=46):
    results = np.zeros((len(labels), dimension))
    for i, label in enumerate(labels):
        results[i, label] = 1
    return results


x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)
one_hot_train_labels = to_one_hot(train_labels)
one_hot_test_labels = to_one_hot(test_labels)

model = models.Sequential()
# # Text output in TensorBoard with tf.summary
#

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.datasets import reuters

# Load the Reuters dataset
INDEX_FROM = 3
START_CHAR = 1
(x_train, y_train), (x_test, y_test) = reuters.load_data(path="reuters.npz",
                                                          num_words=None,
                                                          skip_top=0,
                                                          maxlen=None,
                                                          test_split=0.2,
                                                          seed=113,
                                                          start_char=START_CHAR,
                                                          oov_char=2,
                                                          index_from=INDEX_FROM)

# Mapping from id to word
word_index = reuters.get_word_index(path="reuters_word_index.json")
word_index = {k: (v + INDEX_FROM) for k, v in word_index.items()}
word_index["<PAD>"] = 0
word_index["<START>"] = START_CHAR  # 1
word_index["<UNK>"] = 2
id_to_word = {value: key for key, value in word_index.items()}

# Function that returns a Reuters news item as a string
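# A minimal sketch of such a helper (news_to_string and the log directory are
# assumptions, not the original implementation): join the id_to_word lookups for
# one encoded article, then log it as text to TensorBoard via tf.summary.
def news_to_string(sequence):
    return " ".join(id_to_word.get(idx, "<UNK>") for idx in sequence)

writer = tf.summary.create_file_writer("logs/reuters_text")
with writer.as_default():
    tf.summary.text("first_article", news_to_string(x_train[0]), step=0)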
                        epochs=epochs,
                        batch_size=512,
                        validation_data=(X_val, y_val),
                        verbose=0)
    val_loss = history.history['val_loss']
    return val_loss


if "__main__" == __name__:
    # load data
    np_load_old = np.load
    np.load = lambda *a, **k: np_load_old(*a, allow_pickle=True, **k)
    (X_train, y_train), (X_test, y_test) = reuters.load_data(num_words=10000)
    word_index = reuters.get_word_index(path="reuters_word_index.json")
    np.load = np_load_old

    X_train = vectorize_sequences(X_train)
    X_test = vectorize_sequences(X_test)
    one_hot_y_train = to_categorical(y_train)
    one_hot_y_test = to_categorical(y_test)

    X_val = X_train[:1000]
    partial_X_train = X_train[1000:]
    y_val = one_hot_y_train[:1000]
    partial_y_train = one_hot_y_train[1000:]

    batch_sizes = [200, 400, 600, 800, 1000]
    color = ['b', 'r', 'y', 'g', 'o']
""" # Import libraries import tensorflow as tf import numpy as np from tensorflow.keras.models import Sequential from tensorflow.keras.layers import Dense, Dropout from tensorflow.keras.utils import to_categorical np.random.seed(100) from tensorflow.keras.datasets import reuters from tensorflow.keras.preprocessing.text import Tokenizer # Get data np_load_old = np.load np.load = lambda *a, **k: np_load_old(*a, allow_pickle=True, **k) (x, y), (xtest, ytest) = reuters.load_data(num_words=10000) np.load = np_load_old # Process text tokenizer = Tokenizer(num_words=10000) xtrain = tokenizer.sequences_to_matrix(x, mode='binary') xtest = tokenizer.sequences_to_matrix(xtest, mode='binary') ytrain = to_categorical(y) ytest = to_categorical(ytest) # Initialize model model = Sequential() model.add(Dense(512, activation='relu')) model.add(Dropout(.5)) model.add(Dense(ytrain.shape[1], activation='softmax'))
def load_data(self):
    return reuters.load_data(num_words=self.num_words)
import numpy as np
from tensorflow.keras.datasets import reuters
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import models, layers, regularizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.layers import Conv1D, Conv2D, MaxPooling1D, MaxPooling2D, Dense, Flatten, Dropout, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_words = 1000
maxlen = 250

# Loading the data from the built-in reuters dataset in keras
(X_train, y_train), (X_test, y_test) = reuters.load_data(num_words=max_words,
                                                          test_split=0.25,
                                                          maxlen=maxlen)
# Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/reuters_word_index.json
# 557056/550378 [==============================] - 0s 1us/step

print("Train_data ", X_train.shape)
print("Train_labels ", y_train.shape)
print("Test_data ", X_test.shape)
print("Test_labels ", y_test.shape)

# *--------------------------------------------------------------------------------------------
def vectorize_sequences(sequences, dimension=max_words):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        # print(i, '+', sequence)
        results[i, sequence] = 1.
    return results
# import nltk
# nltk.download()
from nltk import word_tokenize
"""
English tokenization library: Natural Language Toolkit
"""

sentence = "Natural language processing (NLP) is a subfield of computer science, information engineering, " \
           "and artificial intelligence concerned with the interactions between computers and human (natural) " \
           "languages, in particular how to program computers to process and analyze large amounts of natural " \
           "language data."

# print(word_tokenize(sentence))
# ['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'subfield', 'of', 'computer', 'science', ',',
#  'information', 'engineering', ',', 'and', 'artificial', 'intelligence', 'concerned', 'with', 'the', 'interactions',
#  'between', 'computers', 'and', 'human', '(', 'natural', ')', 'languages', ',', 'in', 'particular', 'how', 'to',
#  'program', 'computers', 'to', 'process', 'and', 'analyze', 'large', 'amounts', 'of', 'natural', 'language', 'data',
#  '.']

from tensorflow.keras.datasets import reuters

(X_train, y_train), (X_test, y_test) = reuters.load_data(num_words=None, test_split=0.2)
def run_reuters():
    # Extract useful data from dataset
    print('Extracting the Reuters dataset')
    (train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words=10000)
    print(f"There are {len(train_data)} training examples and {len(test_data)} testing examples")

    # Illustration of the input data
    print(
        f'In this dataset the labels denote the topic of the piece. There are 46 topics represented, each one is '
        f'mutually exclusive.\nHaving taken the top 10,000 most-used words, no word index will exceed 10,000.\n'
        f'Max Index = {max([max(sequence) for sequence in train_data])}')
    print(
        f"For the sake of illustration, let's decode an article back to English (not being printed for easier reading)")
    word_index = reuters.get_word_index()
    reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
    decoded_review = ' '.join([reverse_word_index.get(i - 3, '?') for i in train_data[0]])
    # print(decoded_review)

    # Encoding the inputs
    print("In order to pass these lists of integers into a neural network we must first encode them as tensors of "
          "uniform length.\nIn this example we'll use one-hot encoding, done manually for the sake of understanding.")

    def vectorise_sequences(sequences, dimension=10000):
        ret = np.zeros((len(sequences), dimension))
        for i, sequence in enumerate(sequences):
            ret[i, sequence] = 1
            if i < 1:
                print(f"\n{sequence} => {ret[i]}\n")
        return ret

    x_train = vectorise_sequences(train_data)
    x_test = vectorise_sequences(test_data)

    print("For the labels this time around, there are a few options. A very common option is one-hot encoding, for "
          "which Keras has an in-built function (a manual version is included in the code for educational purposes).")

    def to_one_hot(labels, dimension=46):
        ret = np.zeros((len(labels), dimension))
        for i, label in enumerate(labels):
            ret[i, label] = 1
        return ret

    one_hot_train_labels = to_categorical(train_labels)
    one_hot_test_labels = to_categorical(test_labels)

    # Design and compile the model
    print("Now to build the network, this time using parameters with greater configurability")
    model = models.Sequential()
    model.add(layers.Dense(64, activation='relu', input_shape=(10000,)))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(46, activation='softmax'))
    model.compile(optimizer=optimizers.RMSprop(learning_rate=0.001),
                  loss='categorical_crossentropy',
                  metrics=[metrics.categorical_accuracy])

    # Divide the training data
    print("Creating a validation set for greater insight during training")
    x_val = x_train[:1000]                         # Taking the 1st 1000 samples for validation
    partial_x_train = x_train[1000:]               # Leaving everything from 1000 onwards for training
    y_val = one_hot_train_labels[:1000]            # Taking the 1st 1000 labels for validation
    partial_y_train = one_hot_train_labels[1000:]  # Leaving everything from 1000 onwards for training

    # Train the model
    print("Begin training the model:")
    history = model.fit(partial_x_train, partial_y_train, epochs=20, batch_size=512,
                        validation_data=(x_val, y_val))
    history_dict = history.history
    print(f"\nNote that the history returned by the fit function has a 'history' member which is a dictionary. "
" f"The keys are: {history_dict.keys()}") # ['loss', 'categorical_accuracy', 'val_loss', 'val_categorical_accuracy'] # Prepare to plot the training and validation information loss_values = history_dict['loss'] val_loss_values = history_dict['val_loss'] acc_values = history_dict['categorical_accuracy'] val_acc_values = history_dict['val_categorical_accuracy'] epochs = range(1, len(history_dict['categorical_accuracy']) + 1) plt.plot(epochs, loss_values, 'bo', label='Training Loss') plt.plot(epochs, val_loss_values, 'b', label='Validation Loss') plt.title('Training and Validation Loss') plt.xlabel('Epochs') plt.ylabel('Loss') plt.legend() plt.show() plt.clf() plt.plot(epochs, acc_values, 'bo', label='Training Accuracy') plt.plot(epochs, val_acc_values, 'b', label='Validation Accuracy') plt.title('Training and Validation Accuracy') plt.xlabel('Epochs') plt.ylabel('Accuracy') plt.legend() plt.show() # Evaluate the model print("\nAfter reviewing each plot, evaluate the performance of the model on new data") results = model.evaluate(x_test, one_hot_test_labels) print(f"Evaluation Results: Loss = {results[0]} Accuracy = {results[1] * 100}%")
import tensorflow.keras.datasets.reuters as reuters
from corpus_preparation import *

(x_train, y_train), (x_test, y_test) = reuters.load_data(
    path="op_lab_reuters_dataset.npz",
    num_words=None,
    skip_top=0,
    maxlen=None,
    test_split=0.2,
    seed=2143,
    start_char=1,  # The start of a sequence will be marked with this character.
    oov_char=2,    # The out-of-vocabulary character.
    index_from=3,  # Index actual words with this index and higher.
)
reuters.get_word_index(path="reuters_word_index.json")

corpus = generate_corpus(x_train[:3000], k=100, normalized=False, filename='example_corpus.csv')
print("corpus created")
# Vocabulary input_dim = 10000; how to truncate with maxlen; modeling with an Embedding layer
from tensorflow.keras.datasets import reuters
from tensorflow.keras.layers import Embedding, Dense, LSTM, Flatten, Conv1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import Sequential, load_model
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

(x_train, y_train), (x_test, y_test) = reuters.load_data(
    num_words=5000, test_split=0.2
)  # num_words=5000: keep only the 5,000 most frequent words

print(x_train[0], type(x_train[0]))
print(y_train[0])
print(len(x_train[0]), len(x_train[11]))  # 87 59
print('====================================')
print(x_train.shape, x_test.shape)  # (8982,) (2246,)
print(y_train.shape, y_test.shape)  # (8982,) (2246,)

print('Max article length  : ', max(len(l) for l in x_train))            # 2376
print('Mean article length : ', sum(map(len, x_train)) / len(x_train))   # 145.5398574927633

# plt.hist([len(s) for s in x_train], bins=40)
# plt.show()
# x axis: article length
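# A minimal sketch of the next step implied by the length statistics above
# (maxlen=100 and the *_pad / *_cat names are assumptions, not from the original
# script): pad/truncate every article to one length and one-hot the 46 topics.
x_train_pad = pad_sequences(x_train, maxlen=100, padding='pre')
x_test_pad = pad_sequences(x_test, maxlen=100, padding='pre')
y_train_cat = to_categorical(y_train)
y_test_cat = to_categorical(y_test)
print(x_train_pad.shape, y_train_cat.shape)  # (8982, 100) (8982, 46)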
def load_data_set(type, max_len, vocab_size, batch_size):
    """
    Loads the dataset.
    Keras IMDB dataset for binary classification.
    Keras Reuters dataset for multiclass classification.

    Args:
        type       : {bool} 0 for binary classification (IMDB), 1 for multiclass classification (Reuters)
        max_len    : {int} timesteps used for padding
        vocab_size : {int} size of the vocabulary
        batch_size : batch_size

    Returns:
        train_loader : {torch.Dataloader} train dataloader
        x_test_pad   : padded tokenized test_data for cross validating
        y_test       : y_test
        word_to_id   : {dict} words mapped to indices
    """
    INDEX_FROM = 2

    if not bool(type):
        NUM_WORDS = vocab_size  # only use the top vocab_size words
        dataset = pd.read_csv('df_raw_text2.csv')
        dataset = dataset[~dataset.TEXTOS.isnull()]
        dataset.drop_duplicates(subset="DOCS_ID", keep='first', inplace=True)
        dataset = dataset[dataset.V_AMB != 's/d']
        dataset.drop_duplicates(subset="TEXTOS", keep='first', inplace=True)

        texts1 = [' '.join(txt.splitlines()) for txt in dataset['TEXTOS']]  # remove newlines
        texts2 = [preprocess(txt) for txt in texts1]  # preprocess, i.e. remove punctuation, lowercase, etc.

        t = Tokenizer(num_words=10000,
                      filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                      lower=True,
                      split=" ",
                      char_level=False,
                      oov_token=1)
        t.fit_on_texts(texts2)
        x_test_seq = t.texts_to_sequences(texts2)

        word_to_id = {k: v for k, v in t.word_index.items()}
        word_to_id["<PAD>"] = 0
        word_to_id["<UNK>"] = 1
        id_to_word = {value: key for key, value in word_to_id.items()}

        y = [1 if elem == 'si' else 0 for elem in dataset.VIOLENCIA_DE_GENERO]
        x = np.array(x_test_seq)
        y = np.array(y)
        x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=.75, random_state=1)

        x_train_pad = pad_sequences(x_train, maxlen=max_len)
        x_test_pad = pad_sequences(x_test, maxlen=max_len)

        train_data = data_utils.TensorDataset(
            torch.from_numpy(x_train_pad).type(torch.LongTensor),
            torch.from_numpy(y_train).type(torch.DoubleTensor))
        train_loader = data_utils.DataLoader(train_data, batch_size=batch_size, drop_last=True)
        return train_loader, x_test_pad, y_test, word_to_id
    else:
        from tensorflow.keras.datasets import reuters

        train_set, test_set = reuters.load_data(path="reuters.npz",
                                                num_words=vocab_size,
                                                skip_top=0,
                                                index_from=INDEX_FROM)
        x_train, y_train = train_set[0], train_set[1]
        x_test, y_test = test_set[0], test_set[1]

        word_to_id = reuters.get_word_index(path="reuters_word_index.json")
        word_to_id = {k: (v + 3) for k, v in word_to_id.items()}
        word_to_id["<PAD>"] = 0
        word_to_id["<START>"] = 1
        word_to_id["<UNK>"] = 2
        word_to_id['<EOS>'] = 3
        id_to_word = {value: key for key, value in word_to_id.items()}

        x_train_pad = pad_sequences(x_train, maxlen=max_len)
        x_test_pad = pad_sequences(x_test, maxlen=max_len)

        train_data = data_utils.TensorDataset(
            torch.from_numpy(x_train_pad).type(torch.LongTensor),
            torch.from_numpy(y_train).type(torch.LongTensor))
        train_loader = data_utils.DataLoader(train_data, batch_size=batch_size, drop_last=True)
        return train_loader, train_set, test_set, x_test_pad, word_to_id
'''
Multi-class text classification: Reuters news topic classification.
A dataset of 11,228 Reuters newswires labeled with 46 topics.
As with the IMDB dataset, each newswire is encoded as a sequence of word indices (using the same scheme).
'''
from tensorflow.keras.datasets import reuters

(train_feat, train_label), (test_feat, test_label) = reuters.load_data(num_words=10000)  # use the 10,000 most frequent words
print(train_feat[:3])
print(train_label[:3])
print(train_feat.shape, train_label.shape, test_feat.shape)  # (8982,) (8982,) (2246,)
print()

# Look at the actual words behind the Reuters integer lists
word_index = reuters.get_word_index()
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
decode_re = ' '.join([reverse_word_index.get(i - 3, '?') for i in train_feat[0]])
print(decode_re, '\n\n')

# ===================
# feature: list => vector
import numpy as np

def vector_seq(sequence, dim=10000):
    results = np.zeros((len(sequence), dim))  # zero matrix
    for i, seq in enumerate(sequence):
        results[i, seq] = 1.
    return results

x_train = vector_seq(train_feat)  # vectorize train_feat
x_test = vector_seq(test_feat)    # vectorize test_feat
print(x_train, x_train.shape)
print(x_test, x_test.shape)

# self one-hot encoding
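# A minimal sketch of the hand-rolled one-hot encoding hinted at above
# (one_hot_label is an assumed name): mirror vector_seq(), but applied to
# the 46 topic labels instead of the word indices.
def one_hot_label(labels, dim=46):
    results = np.zeros((len(labels), dim))
    for i, label in enumerate(labels):
        results[i, label] = 1.
    return results

y_train = one_hot_label(train_label)
y_test = one_hot_label(test_label)
print(y_train.shape, y_test.shape)  # (8982, 46) (2246, 46)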
# Set global constants
vocabulary_size = 10000    # choose the 10k most-used words for the truncated vocabulary
sequence_length = 1000     # choose 1000-word sequences
embedding_dims = 50        # number of dimensions to represent each word in vector space
batch_size = 32            # feed the neural network in 32-example training batches
num_epochs = 30            # number of times the neural network goes over EACH training example
config = int(args.config)  # model configuration

# Setup np.load() with allow_pickle=True before loading data as described in:
# https://stackoverflow.com/questions/55890813/how-to-fix-object-arrays-cannot-be-loaded
np_load_old = np.load
np.load = lambda *a, **k: np_load_old(*a, allow_pickle=True, **k)

# Load the Reuters news dataset for document classification
(X_train, Y_train), (X_test, Y_test) = reuters.load_data(num_words=vocabulary_size)

# Set more global constants
num_categories = max(Y_train) + 1

# Restore np.load() for future normal usage
np.load = np_load_old

# Pad sequences to the fixed sequence length
X_train = pad_sequences(sequences=X_train, maxlen=sequence_length)
X_test = pad_sequences(sequences=X_test, maxlen=sequence_length)

# Compute cutoffs and trim training & test examples to a multiple of the batch size
training_cutoff, test_cutoff = len(X_train) % batch_size, len(X_test) % batch_size
X_train, Y_train = X_train[:-training_cutoff], Y_train[:-training_cutoff]
def save_reuters():
    OUT_DIR = 'reuters'

    # Load data from keras API
    (x_train, y_train), (x_test, y_test) = reuters.load_data()

    # get word index
    word_index = reuters.get_word_index()

    # make dictionary to reference index
    word_list = {(value + 3): key for key, value in word_index.items()}
    INVALID_STR = '#$%'  # define invalid string to remove them later
    word_list[0] = INVALID_STR
    word_list[1] = INVALID_STR
    word_list[2] = INVALID_STR

    # define class names from ex: https://github.com/keras-team/keras/issues/12072
    class_list = [
        'cocoa', 'grain', 'veg-oil', 'earn', 'acq', 'wheat', 'copper', 'housing',
        'money-supply', 'coffee', 'sugar', 'trade', 'reserves', 'ship', 'cotton',
        'carcass', 'crude', 'nat-gas', 'cpi', 'money-fx', 'interest', 'gnp',
        'meal-feed', 'alum', 'oilseed', 'gold', 'tin', 'strategic-metal',
        'livestock', 'retail', 'ipi', 'iron-steel', 'rubber', 'heat', 'jobs',
        'lei', 'bop', 'zinc', 'orange', 'pet-chem', 'dlr', 'gas', 'silver',
        'wpi', 'hog', 'lead'
    ]

    # make train/test dirs and class dirs
    for cid, class_name in enumerate(class_list):
        os.makedirs(os.path.join(OUT_DIR, 'train', '{:02d}_{}'.format(cid, class_name)), exist_ok=True)
        os.makedirs(os.path.join(OUT_DIR, 'test', '{:02d}_{}'.format(cid, class_name)), exist_ok=True)

    # convert train data
    for num, (x_data, y_data) in enumerate(zip(x_train, y_train)):
        # make file path
        fpath = os.path.join(OUT_DIR, 'train',
                             '{:02d}_{}'.format(y_data, class_list[y_data]),
                             'train_{:05d}.txt'.format(num))
        with open(fpath, mode='w', encoding='utf-8') as f:
            # convert indices and join words with space
            word_org = ' '.join(word_list[inx] for inx in x_data)
            # remove invalid strings
            word_org = word_org.replace(INVALID_STR + ' ', '')
            # save text
            f.write(word_org)

    # convert test data
    for num, (x_data, y_data) in enumerate(zip(x_test, y_test)):
        # make file path
        fpath = os.path.join(OUT_DIR, 'test',
                             '{:02d}_{}'.format(y_data, class_list[y_data]),
                             'test_{:05d}.txt'.format(num))
        with open(fpath, mode='w', encoding='utf-8') as f:
            # convert indices and join words with space
            word_org = ' '.join(word_list[inx] for inx in x_data)
            # remove invalid strings
            word_org = word_org.replace(INVALID_STR + ' ', '')
            # save text
            f.write(word_org)

    print()
    print('Saved to ' + OUT_DIR + '/')
    print()
import numpy as np
from tensorflow.keras.datasets import reuters
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dropout, LSTM, Dense
from tensorflow.keras.utils import to_categorical

seed = 10
np.random.seed(seed)  # set the random seed

# Load the Reuters dataset
top_words = 10000
(X_train, Y_train), (X_test, Y_test) = reuters.load_data(num_words=top_words)

# Data preprocessing
max_words = 200
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)

# One-hot encoding
Y_train = to_categorical(Y_train, 46)
Y_test = to_categorical(Y_test, 46)

# Define the model
model = Sequential()
model.add(Embedding(top_words, 32, input_length=max_words))
model.add(Dropout(0.75))
model.add(LSTM(32, return_sequences=True))
model.add(LSTM(32))
model.add(Dropout(0.5))
model.add(Dense(46, activation="softmax"))
model.summary()  # print a model summary

# Compile the model
model.compile(loss="categorical_crossentropy", optimizer="rmsprop",
print(tfidfv.transform(corpus).toarray())
print(tfidfv.vocabulary_)  # compare positions / values differ because each package computes TF-IDF differently

"""# TF-IDF
`TfidfVectorizer` on the Reuters dataset
Fit TF-IDF on the Reuters data
https://wikidocs.net/22933
"""

from tensorflow.keras.datasets import reuters
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# Reuters dataset loading: 10000 words with 80% train set, 20% test set
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=10000, test_split=0.2)
print('Number of training samples : {}'.format(len(x_train)))
print('Number of test samples     : {}'.format(len(x_test)))

# display first data point of train and test
print(x_train[0])
print(x_test[0])
# display first target class of train
print(y_train[0])
# display first target class of test
print(y_test[0])
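# A minimal sketch of one way to get TF-IDF features from these integer sequences
# (an assumption, not necessarily the referenced tutorial's approach; index_to_word,
# train_texts and reuters_tfidf are assumed names): decode each article back to a
# space-joined string, then fit sklearn's TfidfVectorizer on the resulting texts.
from sklearn.feature_extraction.text import TfidfVectorizer

word_index = reuters.get_word_index()
index_to_word = {idx + 3: word for word, idx in word_index.items()}  # indices 0-2 are reserved

train_texts = [' '.join(index_to_word.get(i, '?') for i in seq) for seq in x_train]

reuters_tfidf = TfidfVectorizer(max_features=10000)
x_train_tfidf = reuters_tfidf.fit_transform(train_texts)
print(x_train_tfidf.shape)  # (8982, 10000)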
    https://arxiv.org/pdf/1310.4546.pdf
    Built on Reuters dataset of articles
"""
import random
from tqdm import tqdm
from tensorflow.keras.datasets import reuters
import numpy as np

# SECTION 1
# Get corpus
vocab_size = 10000
(train, _), (test, _) = reuters.load_data(path="reuters.npz",
                                           num_words=vocab_size,
                                           test_split=0.2,
                                           seed=1337)

# A dictionary mapping words to an integer index
word_index = reuters.get_word_index()

# The first indices are reserved
word_index = {k: (v + 3) for k, v in word_index.items()}
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2  # unknown
word_index["<UNUSED>"] = 3

reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
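# A minimal skip-gram illustration (skipgram_pairs and the window size are
# assumptions, not from the original script): generate (center, context) index
# pairs with a small symmetric window over each encoded article, the kind of
# corpus the word2vec paper linked above trains on.
def skipgram_pairs(sequences, window=2):
    pairs = []
    for seq in sequences:
        for pos, center in enumerate(seq):
            for offset in range(-window, window + 1):
                ctx = pos + offset
                if offset != 0 and 0 <= ctx < len(seq):
                    pairs.append((center, seq[ctx]))
    return pairs

pairs = skipgram_pairs(train[:100])  # first 100 articles only, to keep it quick
print(len(pairs), pairs[:5])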