def data_extract():
    global numwords
    print("1. Start data_extract")
    print("Load review (%d)..." % num_review)
    # gd is the dataset helper module; num_review is a module-level setting.
    datas = gd.get_dataset('../dataset/movie_data.csv', 'UTF8', num_review)
    w2i, i2w, freqtable, train_set = gd.get_unigram_voca(datas, num_review)
    numwords = len(w2i)
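For reference, a unigram vocabulary like the w2i/i2w pair above can be built from a plain frequency count. This is a minimal sketch, not the actual gd.get_unigram_voca implementation; the whitespace tokenization and the function name are assumptions.

from collections import Counter

def build_unigram_vocab(reviews):
    # Hypothetical stand-in for gd.get_unigram_voca: count every whitespace
    # token, then index words by descending frequency.
    freqtable = Counter(word for review in reviews for word in review.split())
    i2w = [word for word, _ in freqtable.most_common()]
    w2i = {word: i for i, word in enumerate(i2w)}
    return w2i, i2w, freqtable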
Example #2
def initialize_train_dataset(self):
    train_dataset, _ = get_dataset('train', self.vocab)
    train_dataset = train_dataset.shuffle(buffer_size=1000, seed=321)
    train_dataset = train_dataset.padded_batch(
        FLAGS.batch_size,
        padded_shapes=([FLAGS.num_timesteps], (), ()),
        drop_remainder=True).repeat().prefetch(3 * FLAGS.batch_size)
    train_iterator = train_dataset.make_one_shot_iterator()
    self.train_sentence, self.train_sentence_len, self.train_label = (
        train_iterator.get_next())
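A one-shot iterator like this is normally driven from a TF1 session loop that fetches the three output tensors. A minimal sketch, assuming graph-mode execution via tf.compat.v1 and a Model class that owns the method above (the class name and step count are assumptions):

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()
model = Model()              # hypothetical owner of initialize_train_dataset
model.initialize_train_dataset()

with tf.Session() as sess:
    for step in range(1000):  # arbitrary number of training steps
        # Each run() pulls the next padded batch out of the input pipeline.
        sentences, lengths, labels = sess.run(
            [model.train_sentence, model.train_sentence_len, model.train_label])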
Example #3
def initialize_dev_dataset(self):
    dev_dataset, _ = get_dataset('dev', self.vocab)
    dev_dataset = dev_dataset.padded_batch(
        FLAGS.batch_size,  # default padding_value and pad_idx are both 0
        padded_shapes=([FLAGS.num_timesteps], (),
                       ())).prefetch(FLAGS.batch_size * 3)
    # An initializable iterator (not one-shot) is needed here so the dev set
    # can be re-initialized and evaluated more than once.
    dev_iterator = dev_dataset.make_initializable_iterator()
    self.dev_init_op = dev_iterator.initializer
    self.dev_sentence, self.dev_sentence_len, self.dev_label = (
        dev_iterator.get_next())
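Usage note: with the initializable iterator above, sess.run(self.dev_init_op) must be issued before each pass over the dev set; after the initializer runs, the dev tensors can be fetched repeatedly until the iterator raises tf.errors.OutOfRangeError.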
Example #4
def main():
    start = timeit.default_timer()  # requires "import timeit" at module level
    datas = gd.get_dataset('../dataset/movie_data.csv', 'UTF8', num_review)

    print("train data : bigram_tokenizing...")
    s2i, i2class, bigram2i, s_bag = bigram_tokenize(datas)

    input_set = []
    for k in s_bag.keys():
        input_set.append([i2class[k], k])

    print("test data : bigram_tokenizing...")
    test_set = test_bigram_tokenize(datas, bigram2i)

    print()
    print("training...")
    #emb1.shape = (N, D), emb2.shape = (4, D)
    emb1, emb2 = trainer(input_set, bigram2i, s_bag, dimension=DIMEN, learning_rate=RATE, epoch=EPOCH)

    print("testing...")
    print("# of tesing samples")
    print(len(test_set))
    print()
    
    acc = []
    with open("result.txt", "w") as f:  # context manager closes the file
        for i, test in enumerate(test_set, 1):
            _, _, _, predict, list_ = classification(test[0], test[1], emb1, emb2)
            f.write("sentence %d : %s\npredict : %d, real : %d\n"
                    % (i, test[2], list_[0], list_[1]))
            # "predict is 1" tested object identity; value equality is intended
            acc.append(1 if predict == 1 else 0)

    stop = timeit.default_timer()
    print("==============================================")
    print("train_data : all")
    print("test_data : all")
    print("# of bigram : %d" %(len(bigram2i)))
    print("epoch : %d" %EPOCH)
    print("dimen : %d" %DIMEN)
    print("learning_rate : %.3f" %RATE)
    print("computing time : %.2f" %(stop-start))
    print("correct / total : %d / %d" %(sum(acc), len(acc)))
    print("test_set accuracy : %.2f" %(sum(acc)/len(acc) * 100))
    print("==============================================")
Example #5
def initialize_vocab(self):
    _, self.vocab = get_dataset('train')
    self.vocab_freqs = self.vocab.get_freq()
    self.vocab_size = self.vocab.size
Example #6
def classification():
    datas = gd.get_dataset('../dataset/movie_data.csv', 'UTF8', num_review)
Example #7
    plt.xlabel('Num of Epochs')
    plt.ylabel('Accuracy')
    plt.legend(['train', 'validation'], loc='best')

    plt.subplot(1, 2, 2)
    plt.plot(np.arange(1, len(history['loss']) + 1), history['loss'], 'r')
    plt.plot(np.arange(1, len(history['val_loss']) + 1), history['val_loss'], 'g')
    plt.xticks(np.arange(0, epochs + 1, epochs / 10))
    plt.title('Training Loss vs. Validation Loss')
    plt.xlabel('Num of Epochs')
    plt.ylabel('Loss')
    plt.legend(['train', 'validation'], loc='best')

    plt.show()


X_train, X_test, y_train, y_test = get_dataset()

model = get_model(num_classes=5)

model = train_model(model,
                    X_train,
                    X_test,
                    y_train,
                    y_test,
                    batch_size=32,
                    num_epochs=50)

save_model(model)
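If train_model wraps a Keras model.fit call (an assumption; its body is not shown above), the history dict read by the plotting code is the History.history attribute that fit returns. A minimal sketch, with plot_history as a hypothetical name for the plotting helper:

def train_model(model, X_train, X_test, y_train, y_test, batch_size, num_epochs):
    # Passing validation data makes fit record the 'val_loss' series
    # alongside 'loss' in History.history.
    hist = model.fit(X_train, y_train,
                     validation_data=(X_test, y_test),
                     batch_size=batch_size,
                     epochs=num_epochs)
    plot_history(hist.history, num_epochs)  # hypothetical plotting helper
    return model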