def data_extract():
    global numwords
    print("1. Start data_extract")
    print('Load review (%d)...' % num_review)
    datas = gd.get_dataset('../dataset/movie_data.csv', 'UTF8', num_review)
    w2i, i2w, freqtable, train_set = gd.get_unigram_voca(datas, num_review)
    numwords = len(w2i)
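# --- Usage sketch (added illustration, not part of the original code) ---
# A minimal, hedged sketch of what a unigram-vocabulary builder such as
# gd.get_unigram_voca could look like: it maps each word to an index (w2i),
# keeps the reverse map (i2w), and counts word frequencies. The (text, label)
# review format and whitespace tokenization are assumptions for illustration.
from collections import Counter

def build_unigram_vocab(reviews):
    freqtable = Counter()
    for text, _label in reviews:
        freqtable.update(text.split())
    # most frequent words get the smallest indices
    w2i = {w: i for i, (w, _) in enumerate(freqtable.most_common())}
    i2w = {i: w for w, i in w2i.items()}
    return w2i, i2w, freqtable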
def initialize_train_dataset(self):
    train_dataset, _ = get_dataset('train', self.vocab)
    train_dataset = train_dataset.shuffle(buffer_size=1000, seed=321)
    train_dataset = train_dataset.padded_batch(
        FLAGS.batch_size,
        padded_shapes=([FLAGS.num_timesteps], (), ()),
        drop_remainder=True).repeat().prefetch(3 * FLAGS.batch_size)
    # the train pipeline repeats indefinitely, so a one-shot iterator is enough
    train_iterator = train_dataset.make_one_shot_iterator()
    self.train_sentence, self.train_sentence_len, self.train_label = train_iterator.get_next()
def initialize_dev_dataset(self):
    dev_dataset, _ = get_dataset('dev', self.vocab)
    # default padding_value and pad_idx are both 0
    dev_dataset = dev_dataset.padded_batch(
        FLAGS.batch_size,
        padded_shapes=([FLAGS.num_timesteps], (), ())).prefetch(FLAGS.batch_size * 3)
    # the dev iterator must be re-runnable for repeated evaluation,
    # so use an initializable iterator rather than a one-shot one
    dev_iterator = dev_dataset.make_initializable_iterator()
    self.dev_init_op = dev_iterator.make_initializer(dev_dataset, name='dev_init')
    self.dev_sentence, self.dev_sentence_len, self.dev_label = dev_iterator.get_next()
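# --- Usage sketch (added illustration, not part of the original code) ---
# A hedged sketch of how the two iterators above could be consumed in a TF 1.x
# session: the train iterator repeats forever, while the dev iterator is rewound
# via dev_init_op before every evaluation pass. 'model', 'train_op', 'loss',
# 'num_train_steps', and 'eval_every' are hypothetical names for this sketch.
import tensorflow as tf

def train_and_evaluate(model, num_train_steps=1000, eval_every=100):
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for step in range(1, num_train_steps + 1):
            sess.run(model.train_op)              # pulls the next train batch internally
            if step % eval_every == 0:
                sess.run(model.dev_init_op)       # rewind the dev iterator
                losses = []
                try:
                    while True:                   # run until the dev set is exhausted
                        losses.append(sess.run(model.loss))
                except tf.errors.OutOfRangeError:
                    pass
                print('step %d, dev loss %.4f' % (step, sum(losses) / len(losses)))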
def main():
    start = timeit.default_timer()
    datas = gd.get_dataset('../dataset/movie_data.csv', 'UTF8', num_review)

    print("train data : bigram_tokenizing...")
    s2i, i2class, bigram2i, s_bag = bigram_tokenize(datas)
    input_set = []
    for k in s_bag.keys():
        input_set.append([i2class[k], k])

    print("test data : bigram_tokenizing...")
    test_set = test_bigram_tokenize(datas, bigram2i)
    print()

    print("training...")
    # emb1.shape = (N, D), emb2.shape = (4, D)
    emb1, emb2 = trainer(input_set, bigram2i, s_bag,
                         dimension=DIMEN, learning_rate=RATE, epoch=EPOCH)

    print("testing...")
    print("# of testing samples")
    print(len(test_set))
    print()

    acc = []
    with open("result.txt", "w") as f:
        for i, test in enumerate(test_set, start=1):
            _, _, _, predict, list_ = classification(test[0], test[1], emb1, emb2)
            f.write("sentence %d : %s\npredict : %d, real : %d\n"
                    % (i, test[2], list_[0], list_[1]))
            # 'is 1' tests identity, not equality; use == for a correct comparison
            acc.append(1 if predict == 1 else 0)

    stop = timeit.default_timer()
    print("==============================================")
    print("train_data : all")
    print("test_data : all")
    print("# of bigram : %d" % len(bigram2i))
    print("epoch : %d" % EPOCH)
    print("dimen : %d" % DIMEN)
    print("learning_rate : %.3f" % RATE)
    print("computing time : %.2f" % (stop - start))
    print("correct / total : %d / %d" % (sum(acc), len(acc)))
    print("test_set accuracy : %.2f" % (sum(acc) / len(acc) * 100))
    print("==============================================")
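# --- Usage sketch (added illustration, not the classification() used above) ---
# A hedged sketch of how a bigram-embedding classifier can turn the two learned
# matrices into a prediction: average the sentence's bigram vectors from emb1
# (shape (N, D)) and score them against the four class vectors in emb2
# (shape (4, D)). 'bigram_ids' is a hypothetical list of bigram indices.
import numpy as np

def predict_class(bigram_ids, emb1, emb2):
    sent_vec = emb1[bigram_ids].mean(axis=0)   # (D,) averaged sentence representation
    scores = emb2 @ sent_vec                   # (4,) similarity to each class vector
    return int(np.argmax(scores))              # index of the best-scoring class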
def initialize_vocab(self):
    _, self.vocab = get_dataset('train')
    self.vocab_freqs = self.vocab.get_freq()
    self.vocab_size = self.vocab.size
def classification():
    datas = gd.get_dataset('../dataset/movie_data.csv', 'UTF8', num_review)
# continuation of the training-curve plotting: the accuracy subplot is set up
# before this point, then the loss subplot follows
plt.xlabel('Num of Epochs')
plt.ylabel('Accuracy')
plt.legend(['train', 'validation'], loc='best')

plt.subplot(1, 2, 2)
plt.plot(np.arange(1, len(history['loss']) + 1), history['loss'], 'r')
plt.plot(np.arange(1, len(history['val_loss']) + 1), history['val_loss'], 'g')
plt.xticks(np.arange(0, epochs + 1, epochs / 10))
plt.title('Training Loss vs. Validation Loss')
plt.xlabel('Num of Epochs')
plt.ylabel('Loss')
plt.legend(['train', 'validation'], loc='best')
plt.show()

X_train, X_test, y_train, y_test = get_dataset()
model = get_model(num_classes=5)
model = train_model(model, X_train, X_test, y_train, y_test,
                    batch_size=32, num_epochs=50)
save_model(model)
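# --- Usage sketch (added illustration, not the original get_model/train_model) ---
# A hedged sketch of the kind of Keras model the calls above imply: a small
# classifier with a 5-way softmax output whose fit() history supplies the
# accuracy/loss curves plotted above. Input shape and layer sizes are placeholders.
from tensorflow import keras

def get_model_sketch(num_classes=5, input_dim=100):
    model = keras.Sequential([
        keras.layers.Dense(64, activation='relu', input_shape=(input_dim,)),
        keras.layers.Dense(num_classes, activation='softmax'),
    ])
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model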