import numpy as np
from tensorflow.keras.datasets import reuters


def reuters_raw(max_features=20000):
    index_offset = 3  # word index offset
    (x_train, y_train), (x_test, y_test) = reuters.load_data(
        num_words=max_features, index_from=index_offset
    )
    y_train = y_train.reshape(-1, 1)
    y_test = y_test.reshape(-1, 1)

    word_to_id = reuters.get_word_index()
    word_to_id = {k: (v + index_offset) for k, v in word_to_id.items()}
    word_to_id["<PAD>"] = 0
    word_to_id["<START>"] = 1
    word_to_id["<UNK>"] = 2
    id_to_word = {value: key for key, value in word_to_id.items()}

    x_train = list(
        map(lambda sentence: " ".join(id_to_word[i] for i in sentence), x_train)
    )
    x_test = list(
        map(lambda sentence: " ".join(id_to_word[i] for i in sentence), x_test)
    )
    # np.str was removed from NumPy; the built-in str dtype is equivalent here
    x_train = np.array(x_train, dtype=str)
    x_test = np.array(x_test, dtype=str)
    return (x_train, y_train), (x_test, y_test)
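# A minimal usage sketch of reuters_raw() above (not part of the original snippet);
# the variable names are illustrative.
(train_texts, train_labels), (test_texts, test_labels) = reuters_raw(max_features=20000)
print(train_texts[0][:200])   # first 200 characters of the first decoded article
print(train_labels[0])        # its topic id, shape (1,) because of the reshape above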
def _get_word_index(self):
    word_index = reuters.get_word_index()
    word_index = {k: (v + 3) for k, v in word_index.items()}
    word_index["<PAD>"] = 0
    word_index["<START>"] = 1
    word_index["<UNK>"] = 2  # unknown
    word_index["<UNUSED>"] = 3
    return word_index
plt.ylabel('number of samples')
plt.show()

# few EDA: a variation of countplot
fig, axe = plt.subplots(ncols=1)
fig.set_size_inches(12, 5)
sns.countplot(y_train)

# few EDA: unique elements, element counts
unique_elements, counts_elements = np.unique(y_train, return_counts=True)
print("Frequency of each class:")
print(np.asarray((unique_elements, counts_elements)))

# download reuters word index
word_index = reuters.get_word_index(path="reuters_word_index.json")
# word index of 'the'
word_index['the']
# word index of 'it'
word_index['it']

# index to word simulation
index_to_word = {index + 3: word for word, index in word_index.items()}
# retrieving words by their indexes
print(index_to_word[4])
print(index_to_word[16])

# inserting "Secret" index tags
import os

from tensorflow.keras.datasets import reuters


def save_reuters():
    OUT_DIR = 'reuters'

    # Load data from keras API
    (x_train, y_train), (x_test, y_test) = reuters.load_data()

    # get word index
    word_index = reuters.get_word_index()

    # make dictionary to reference index
    word_list = {(value + 3): key for key, value in word_index.items()}
    INVALID_STR = '#$%'  # define invalid string to remove them later
    word_list[0] = INVALID_STR
    word_list[1] = INVALID_STR
    word_list[2] = INVALID_STR

    # define class names from ex: https://github.com/keras-team/keras/issues/12072
    class_list = [
        'cocoa', 'grain', 'veg-oil', 'earn', 'acq', 'wheat', 'copper', 'housing',
        'money-supply', 'coffee', 'sugar', 'trade', 'reserves', 'ship', 'cotton',
        'carcass', 'crude', 'nat-gas', 'cpi', 'money-fx', 'interest', 'gnp',
        'meal-feed', 'alum', 'oilseed', 'gold', 'tin', 'strategic-metal',
        'livestock', 'retail', 'ipi', 'iron-steel', 'rubber', 'heat', 'jobs',
        'lei', 'bop', 'zinc', 'orange', 'pet-chem', 'dlr', 'gas', 'silver',
        'wpi', 'hog', 'lead'
    ]

    # make train/test dirs and class dirs
    for cid, class_name in enumerate(class_list):
        os.makedirs(os.path.join(OUT_DIR, 'train', '{:02d}_{}'.format(cid, class_name)), exist_ok=True)
        os.makedirs(os.path.join(OUT_DIR, 'test', '{:02d}_{}'.format(cid, class_name)), exist_ok=True)

    # convert train data
    for num, (x_data, y_data) in enumerate(zip(x_train, y_train)):
        # make file path
        fpath = os.path.join(OUT_DIR, 'train',
                             '{:02d}_{}'.format(y_data, class_list[y_data]),
                             'train_{:05d}.txt'.format(num))
        with open(fpath, mode='w', encoding='utf-8') as f:
            # convert indices and join words with space
            word_org = ' '.join(word_list[inx] for inx in x_data)
            # remove invalid strings
            word_org = word_org.replace(INVALID_STR + ' ', '')
            # save text
            f.write(word_org)

    # convert test data
    for num, (x_data, y_data) in enumerate(zip(x_test, y_test)):
        # make file path
        fpath = os.path.join(OUT_DIR, 'test',
                             '{:02d}_{}'.format(y_data, class_list[y_data]),
                             'test_{:05d}.txt'.format(num))
        with open(fpath, mode='w', encoding='utf-8') as f:
            # convert indices and join words with space
            word_org = ' '.join(word_list[inx] for inx in x_data)
            # remove invalid strings
            word_org = word_org.replace(INVALID_STR + ' ', '')
            # save text
            f.write(word_org)

    print()
    print('Saved to ' + OUT_DIR + '/')
    print()
# Commented out IPython magic to ensure Python compatibility.
from tensorflow.keras.datasets import reuters
# %matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

(X_train, Y_train), (X_test, Y_test) = reuters.load_data(num_words=1000, test_split=0.2)

print('Train data : {}'.format(len(X_train)))
print('Test data : {}'.format(len(X_test)))

num_classes = max(Y_train) + 1
print('class : {}'.format(num_classes))

word_index = reuters.get_word_index()

"""We will classify the 11,228 newswires into 46 topics. The modules for word
one-hot encoding and model building are imported here, and the data
preprocessing is performed below using utilities such as pad_sequences.
EarlyStopping is added to prevent overfitting during training. An LSTM model
is then built and trained."""

index_word = {}
for key, value in word_index.items():
    index_word[value] = key

from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

max_len = 100
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)
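# The docstring above describes an Embedding + LSTM classifier trained with
# EarlyStopping; the model itself is not shown in this snippet. Below is a
# minimal sketch of such a model. Layer sizes, patience and epoch count are
# illustrative assumptions, not values from the original.
y_train_cat = to_categorical(Y_train)
y_test_cat = to_categorical(Y_test)

model = Sequential()
model.add(Embedding(1000, 128))            # input_dim matches num_words=1000 above
model.add(LSTM(128))
model.add(Dense(46, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

early_stop = EarlyStopping(monitor='val_loss', patience=3)
history = model.fit(X_train, y_train_cat, batch_size=128, epochs=20,
                    validation_data=(X_test, y_test_cat), callbacks=[early_stop])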
def get_text(self, data):
    word_id_index = reuters.get_word_index()
    id_word_index = dict([(id, value) for (value, id) in word_id_index.items()])
    return ' '.join([id_word_index.get(i - 3, '?') for i in data])
import numpy as np
import string
import textwrap

from tensorflow.keras.datasets import reuters
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Embedding, Dense, Dropout, Input, LSTM, GRU, Bidirectional, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2

"""Uses the [Reuters newswire](https://keras.io/api/datasets/reuters/) classification dataset,
which has text paired with 46 topics as labels. You can see what these labels represent
[here](https://martin-thoma.com/nlp-reuters/)."""

(X_train, y_train), (_, _) = reuters.load_data()

# https://stackoverflow.com/questions/42821330/restore-original-text-from-keras-s-imdb-dataset
# Needed to encode our own reviews later
word_dict = reuters.get_word_index()
word_dict = {k: (v + 3) for k, v in word_dict.items()}
word_dict["<PAD>"] = 0
word_dict["<START>"] = 1
word_dict["<UNK>"] = 2
word_dict["<UNUSED>"] = 3

vocab_size = len(word_dict.keys())

# Needed to decode training data into readable text
inverse_word_dict = {value: key for key, value in word_dict.items()}

X_train = np.array(X_train)
X_train = pad_sequences(X_train)
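# As a quick sanity check (an illustrative addition, not part of the original
# snippet), inverse_word_dict can be used to turn a padded sequence back into text.
decoded = ' '.join(inverse_word_dict.get(i, '<UNK>') for i in X_train[0] if i != 0)
print(textwrap.fill(decoded, width=80))  # skip the zero-padding prepended by pad_sequences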
from tensorflow.keras.datasets import reuters


def decode_review(index_review):
    word_index = reuters.get_word_index()
    reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
    # i - 3 because 0, 1, 2 are reserved indices for "padding", "start of sequence" and "unknown"
    return ' '.join([reverse_word_index.get(i - 3, '?') for i in index_review])
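# Small usage sketch (illustrative, not from the original snippet): decode the
# first article in the training split.
(train_data, train_labels), _ = reuters.load_data(num_words=10000)
print(decode_review(train_data[0]))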
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras import layers, metrics, models, optimizers
from tensorflow.keras.datasets import reuters
from tensorflow.keras.utils import to_categorical


def run_reuters():
    # Extract useful data from dataset
    print('Extracting the Reuters dataset')
    (train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words=10000)
    print(f"There are {len(train_data)} training examples and {len(test_data)} testing examples")

    # Illustration of the input data
    print(
        f'In this dataset the labels denote the topic of the piece. There are 46 topics represented, each one is '
        f'mutually exclusive.\nHaving taken the top 10,000 most-used words no word index will exceed 10,000.\n'
        f'Max Index = {max([max(sequence) for sequence in train_data])}')
    print(
        f"For the sake of illustration, let's decode an article back to English (not being printed for easier reading)")
    word_index = reuters.get_word_index()
    reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
    decoded_review = ' '.join([reverse_word_index.get(i - 3, '?') for i in train_data[0]])
    # print(decoded_review)

    # Encoding the inputs
    print("In order to pass these lists of integers into a neural network we must first encode them as tensors of "
          "uniform length.\nIn this example we'll use one-hot encoding, done manually for the sake of understanding.")

    def vectorise_sequences(sequences, dimension=10000):
        ret = np.zeros((len(sequences), dimension))
        for i, sequence in enumerate(sequences):
            ret[i, sequence] = 1
            if i < 1:
                print(f"\n{sequence} => {ret[i]}\n")
        return ret

    x_train = vectorise_sequences(train_data)
    x_test = vectorise_sequences(test_data)

    print("For the labels this time around, there are a few options. A very common option is one-hot encoding, for "
          "which Keras has an in-built function (a manual version is included in the code for educational purposes)")

    def to_one_hot(labels, dimension=46):
        ret = np.zeros((len(labels), dimension))
        for i, label in enumerate(labels):
            ret[i, label] = 1
        return ret

    one_hot_train_labels = to_categorical(train_labels)
    one_hot_test_labels = to_categorical(test_labels)

    # Design and compile the model
    print("Now to build the network, this time using parameters with greater configurability")
    model = models.Sequential()
    model.add(layers.Dense(64, activation='relu', input_shape=(10000,)))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(46, activation='softmax'))
    model.compile(optimizer=optimizers.RMSprop(learning_rate=0.001),
                  loss='categorical_crossentropy',
                  metrics=[metrics.categorical_accuracy])

    # Divide the training data
    print("Creating a validation set for greater insight during training")
    x_val = x_train[:1000]                         # Taking the 1st 1000 samples for validation
    partial_x_train = x_train[1000:]               # Leaving everything from 1000 onwards for training
    y_val = one_hot_train_labels[:1000]            # Taking the 1st 1000 labels for validation
    partial_y_train = one_hot_train_labels[1000:]  # Leaving everything from 1000 onwards for training

    # Train the model
    print("Begin training the model:")
    history = model.fit(partial_x_train,
                        partial_y_train,
                        epochs=20,
                        batch_size=512,
                        validation_data=(x_val, y_val))
    history_dict = history.history
    print(f"\nNote that the history returned by the fit function has a 'history' member which is a dictionary. "
" f"The keys are: {history_dict.keys()}") # ['loss', 'categorical_accuracy', 'val_loss', 'val_categorical_accuracy'] # Prepare to plot the training and validation information loss_values = history_dict['loss'] val_loss_values = history_dict['val_loss'] acc_values = history_dict['categorical_accuracy'] val_acc_values = history_dict['val_categorical_accuracy'] epochs = range(1, len(history_dict['categorical_accuracy']) + 1) plt.plot(epochs, loss_values, 'bo', label='Training Loss') plt.plot(epochs, val_loss_values, 'b', label='Validation Loss') plt.title('Training and Validation Loss') plt.xlabel('Epochs') plt.ylabel('Loss') plt.legend() plt.show() plt.clf() plt.plot(epochs, acc_values, 'bo', label='Training Accuracy') plt.plot(epochs, val_acc_values, 'b', label='Validation Accuracy') plt.title('Training and Validation Accuracy') plt.xlabel('Epochs') plt.ylabel('Accuracy') plt.legend() plt.show() # Evaluate the model print("\nAfter reviewing each plot, evaluate the performance of the model on new data") results = model.evaluate(x_test, one_hot_test_labels) print(f"Evaluation Results: Loss = {results[0]} Accuracy = {results[1] * 100}%")
# plt.hist([len(s) for s in x_train], bins=50)
# plt.show()

# distribution of y
unique_elements, count_elements = np.unique(y_train, return_counts=True)
print('y distribution : ', dict(zip(unique_elements, count_elements)))  # zip pairs each class with its count, e.g. 0 with 55, 1 with 432
print('=============================================================')

# plt.hist(y_train, bins = 46)
# plt.show()

# word distribution of x
word_to_index = reuters.get_word_index()  # only available for the keras datasets
print(word_to_index)
print(type(word_to_index))
print('=============================================================')

# swap keys and values
index_to_word = {}
for key, value in word_to_index.items():
    index_to_word[value] = key

# after swapping keys and values
print(index_to_word)
print(index_to_word[1])      # the
print(len(index_to_word))    # 30979
print(index_to_word[30979])  # northerly
import numpy as np
from tensorflow.keras.datasets import reuters

(train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words=10000)

class_names = ['cocoa', 'grain', 'veg-oil', 'earn', 'acq', 'wheat', 'copper', 'housing', 'money-supply',
               'coffee', 'sugar', 'trade', 'reserves', 'ship', 'cotton', 'carcass', 'crude', 'nat-gas',
               'cpi', 'money-fx', 'interest', 'gnp', 'meal-feed', 'alum', 'oilseed', 'gold', 'tin',
               'strategic-metal', 'livestock', 'retail', 'ipi', 'iron-steel', 'rubber', 'heat', 'jobs',
               'lei', 'bop', 'zinc', 'orange', 'pet-chem', 'dlr', 'gas', 'silver', 'wpi', 'hog', 'lead']

print(f"label {class_names[train_labels[0]]}")
print(train_data[0])
print(train_labels[0])

word_to_indx = reuters.get_word_index()
print(word_to_indx)

inverted_word_index = dict([(value, key) for key, value in word_to_indx.items()])
text_news = " ".join(inverted_word_index.get(i, "?") for i in train_data[0])
text_news


def bag_of_words(text_samples, max_elements=10000):
    # multi-hot encode each sequence of word indices into a fixed-length vector
    output = np.zeros(shape=(len(text_samples), max_elements))
    for i, sequence in enumerate(text_samples):
        output[i, sequence] = 1
    return output


x_train = bag_of_words(train_data)
x_test = bag_of_words(test_data)
import numpy as np
import pandas as pd
import torch
import torch.utils.data as data_utils
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer


def load_data_set(type, max_len, vocab_size, batch_size):
    """
    Loads the dataset. Keras IMDB dataset for binary classification,
    Keras Reuters dataset for multiclass classification.

    Args:
        type       : {bool} 0 for binary classification, returns the IMDB-style dataset.
                     1 for multiclass classification, returns the Reuters dataset
        max_len    : {int} timesteps used for padding
        vocab_size : {int} size of the vocabulary
        batch_size : batch_size

    Returns:
        train_loader : {torch.Dataloader} train dataloader
        x_test_pad   : padded tokenized test_data for cross validating
        y_test       : y_test
        word_to_id   : {dict} words mapped to indices
    """
    INDEX_FROM = 2

    if not bool(type):
        NUM_WORDS = vocab_size  # only use the top `vocab_size` words
        dataset = pd.read_csv('df_raw_text2.csv')
        dataset = dataset[~dataset.TEXTOS.isnull()]
        dataset.drop_duplicates(subset="DOCS_ID", keep='first', inplace=True)
        dataset = dataset[dataset.V_AMB != 's/d']
        dataset.drop_duplicates(subset="TEXTOS", keep='first', inplace=True)

        texts1 = [' '.join(txt.splitlines()) for txt in dataset['TEXTOS']]  # remove \n
        texts2 = [preprocess(txt) for txt in texts1]  # preprocess, i.e. remove punctuation, lowercase, etc. (preprocess() is assumed to be defined elsewhere in this module)

        t = Tokenizer(num_words=10000,
                      filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                      lower=True,
                      split=" ",
                      char_level=False,
                      oov_token=1)
        t.fit_on_texts(texts2)
        x_test_seq = t.texts_to_sequences(texts2)

        word_to_id = {k: v for k, v in t.word_index.items()}
        word_to_id["<PAD>"] = 0
        word_to_id["<UNK>"] = 1
        id_to_word = {value: key for key, value in word_to_id.items()}

        y = [1 if elem == 'si' else 0 for elem in dataset.VIOLENCIA_DE_GENERO]
        x = np.array(x_test_seq)
        y = np.array(y)
        x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=.75, random_state=1)

        x_train_pad = pad_sequences(x_train, maxlen=max_len)
        x_test_pad = pad_sequences(x_test, maxlen=max_len)

        train_data = data_utils.TensorDataset(
            torch.from_numpy(x_train_pad).type(torch.LongTensor),
            torch.from_numpy(y_train).type(torch.DoubleTensor))
        train_loader = data_utils.DataLoader(train_data, batch_size=batch_size, drop_last=True)
        return train_loader, x_test_pad, y_test, word_to_id
    else:
        from tensorflow.keras.datasets import reuters

        train_set, test_set = reuters.load_data(path="reuters.npz",
                                                num_words=vocab_size,
                                                skip_top=0,
                                                index_from=INDEX_FROM)
        x_train, y_train = train_set[0], train_set[1]
        x_test, y_test = test_set[0], test_set[1]

        word_to_id = reuters.get_word_index(path="reuters_word_index.json")
        word_to_id = {k: (v + 3) for k, v in word_to_id.items()}
        word_to_id["<PAD>"] = 0
        word_to_id["<START>"] = 1
        word_to_id["<UNK>"] = 2
        word_to_id['<EOS>'] = 3
        id_to_word = {value: key for key, value in word_to_id.items()}

        x_train_pad = pad_sequences(x_train, maxlen=max_len)
        x_test_pad = pad_sequences(x_test, maxlen=max_len)

        train_data = data_utils.TensorDataset(
            torch.from_numpy(x_train_pad).type(torch.LongTensor),
            torch.from_numpy(y_train).type(torch.LongTensor))
        train_loader = data_utils.DataLoader(train_data, batch_size=batch_size, drop_last=True)
        return train_loader, train_set, test_set, x_test_pad, word_to_id
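# Usage sketch for load_data_set() above (the argument values are illustrative
# assumptions): type=1 selects the Reuters (multiclass) branch, and the returned
# DataLoader yields (sequence, label) batches.
train_loader, train_set, test_set, x_test_pad, word_to_id = load_data_set(
    type=1, max_len=400, vocab_size=10000, batch_size=32)

for batch_x, batch_y in train_loader:
    print(batch_x.shape, batch_y.shape)  # e.g. torch.Size([32, 400]) torch.Size([32])
    break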