def __build_FFNN_layers2(cls, n_features, n_classes, n_hidden1, learning_rate, watch=WatchUtil()):
    if len(cls.graph_nodes) == 0:
        log.info('create tensorflow graph...')
        watch.start('create tensorflow graph')
        log.info('n_features: %s' % n_features)
        log.info('n_classes: %s' % n_classes)
        log.info('n_hidden1: %s' % n_hidden1)

        tf.set_random_seed(777)  # for reproducibility

        X = tf.placeholder(tf.float32, [None, n_features], name='X')  # two characters
        Y = tf.placeholder(tf.float32, [None, n_classes], name='Y')

        W1 = tf.Variable(tf.random_normal([n_features, n_hidden1]), name='W1')
        b1 = tf.Variable(tf.random_normal([n_hidden1]), name='b1')
        layer1 = tf.sigmoid(tf.matmul(X, W1) + b1, name='layer1')

        W2 = tf.Variable(tf.random_normal([n_hidden1, n_classes]), name='W2')
        b2 = tf.Variable(tf.random_normal([n_classes]), name='b2')
        hypothesis = tf.sigmoid(tf.matmul(layer1, W2) + b2, name='hypothesis')

        cost = -tf.reduce_mean(Y * tf.log(hypothesis) + (1 - Y) * tf.log(1 - hypothesis), name='cost')  # cost/loss function
        # train_step = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(cost)  # Too bad. sentences=10000 + layer=2, 20 min, accuracy: 0.689373, cost: 0.8719
        train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)  # Very good!! sentences=10000 + layer=2, 10 min, accuracy: 0.9194, cost: 0.2139

        predicted = tf.cast(hypothesis > 0.5, dtype=tf.float32, name='predicted')  # 0 <= hypothesis <= 1
        accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted, Y), dtype=tf.float32), name='accuracy')

        watch.stop('create tensorflow graph')
        log.info('create tensorflow graph OK.\n')
        cls.graph_nodes = {'hypothesis': hypothesis, 'predicted': predicted, 'accuracy': accuracy,
                           'X': X, 'Y': Y, 'train_step': train_step, 'cost': cost}
    return cls.graph_nodes
def __build_FFNN_layers4(cls, n_features, n_classes, n_hidden1, learning_rate, watch=WatchUtil()):
    log.info('\nbuild_FFNN')
    if len(cls.graph_nodes) == 0:
        n_hidden3 = n_hidden2 = n_hidden1
        log.info('create tensorflow graph...')
        watch.start('create tensorflow graph')
        log.info('n_features: %s' % n_features)
        log.info('n_classes: %s' % n_classes)
        log.info('n_hidden1: %s' % n_hidden1)
        log.info('n_hidden2: %s' % n_hidden2)
        log.info('n_hidden3: %s' % n_hidden3)

        tf.set_random_seed(777)  # for reproducibility

        X = tf.placeholder(tf.float32, [None, n_features], name='X')  # two characters
        Y = tf.placeholder(tf.float32, [None, n_classes], name='Y')

        # W1 = tf.Variable(tf.truncated_normal([n_features, n_hidden1], mean=0.0, stddev=0.1), name='W1')
        # b1 = tf.Variable(tf.constant(0.1, shape=[n_hidden1]), name='b1')
        W1 = tf.Variable(tf.random_normal([n_features, n_hidden1]), name='W1')
        b1 = tf.Variable(tf.random_normal([n_hidden1]), name='b1')
        layer1 = tf.nn.relu(tf.matmul(X, W1) + b1, name='layer1')

        W2 = tf.Variable(tf.random_normal([n_hidden1, n_hidden2]), name='W2')
        b2 = tf.Variable(tf.random_normal([n_hidden2]), name='b2')
        layer2 = tf.nn.relu(tf.matmul(layer1, W2) + b2, name='layer2')

        W3 = tf.Variable(tf.random_normal([n_hidden2, n_hidden3]), name='W3')
        b3 = tf.Variable(tf.random_normal([n_hidden3]), name='b3')
        layer3 = tf.nn.relu(tf.matmul(layer2, W3) + b3, name='layer3')

        W4 = tf.Variable(tf.random_normal([n_hidden3, n_classes]), name='W4')
        b4 = tf.Variable(tf.random_normal([n_classes]), name='b4')
        y_hat = tf.add(tf.matmul(layer3, W4), b4, name='y_hat')

        # cost = -tf.reduce_mean(Y * tf.log(hypothesis) + (1 - Y) * tf.log(1 - hypothesis), name='cost')  # cost/loss function
        cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=y_hat, labels=Y), name='cost')
        train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)  # Very very good!! sentences=10000 + layer=4, 10 min, accuracy: 0.9294, cost: 0.1839

        predicted = tf.cast(tf.sigmoid(y_hat) > 0.5, dtype=tf.float32, name='predicted')  # y_hat is a logit, so threshold sigmoid(y_hat) at 0.5 (equivalent to y_hat > 0)
        accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted, Y), dtype=tf.float32), name='accuracy')

        watch.stop('create tensorflow graph')
        log.info('create tensorflow graph OK.\n')
        cls.graph_nodes = {'predicted': predicted, 'accuracy': accuracy, 'X': X, 'Y': Y, 'train_step': train_step, 'cost': cost}
    return cls.graph_nodes
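The 4-layer variant above swaps the hand-written cross-entropy of __build_FFNN_layers2 for tf.nn.sigmoid_cross_entropy_with_logits, which computes the same loss from raw logits in a numerically stable form. A minimal numpy sketch of that identity (the sample values are arbitrary):

import numpy as np

def manual_cost(z, y):
    # hand-written form from __build_FFNN_layers2, applied to probabilities
    h = 1.0 / (1.0 + np.exp(-z))  # sigmoid
    return -(y * np.log(h) + (1 - y) * np.log(1 - h))

def stable_cost(z, y):
    # stable form documented for tf.nn.sigmoid_cross_entropy_with_logits:
    # max(z, 0) - z*y + log(1 + exp(-|z|))
    return np.maximum(z, 0) - z * y + np.log1p(np.exp(-np.abs(z)))

z = np.array([-3.0, -0.5, 0.0, 2.0])  # arbitrary logits
y = np.array([0.0, 1.0, 1.0, 1.0])    # binary labels
assert np.allclose(manual_cost(z, y), stable_cost(z, y))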
def build_FFNN(cls, n_features, n_classes, n_hidden1, learning_rate, watch=WatchUtil(), layers=4):
    log.info('\nbuild_FFNN(layers=%s)' % layers)
    if layers == 2:
        return cls.__build_FFNN_layers2(n_features, n_classes, n_hidden1, learning_rate, watch=watch)
    else:
        return cls.__build_FFNN_layers4(n_features, n_classes, n_hidden1, learning_rate, watch=watch)
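For reference, a minimal sketch of how the graph dict returned by build_FFNN is consumed; it mirrors the calls in learning() below, but the sizes and batch arrays here are hypothetical:

import numpy as np
import tensorflow as tf

n_features, n_classes = 100, 1  # hypothetical sizes
graph = WordSpacing.build_FFNN(n_features, n_classes, n_hidden1=100, learning_rate=0.01, layers=4)
X, Y = graph['X'], graph['Y']
train_step, cost, accuracy = graph['train_step'], graph['cost'], graph['accuracy']

features_batch = np.random.rand(8, n_features).astype(np.float32)           # hypothetical inputs
labels_batch = np.random.randint(0, 2, (8, n_classes)).astype(np.float32)   # hypothetical labels

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    _, c = sess.run([train_step, cost], feed_dict={X: features_batch, Y: labels_batch})
    acc = sess.run(accuracy, feed_dict={X: features_batch, Y: labels_batch})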
def learning(cls, total_epoch, n_train, n_valid, n_test, batch_size, left_gram, right_gram, model_file, features_vector, labels_vector,
             n_hidden1=100, learning_rate=0.01, early_stop_cost=0.001):
    ngram = left_gram + right_gram
    n_features = len(features_vector) * ngram  # number of features = 17,380 * 4
    n_classes = len(labels_vector) if len(labels_vector) >= 3 else 1  # number of classes = 2 but len=1

    log.info('load characters list...')
    log.info('load characters list OK. len: %s\n' % NumUtil.comma_str(len(features_vector)))
    watch = WatchUtil()

    train_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'word_spacing',
                              'ko.wikipedia.org.dataset.sentences=%s.left=%d.right=%d.train.gz' % (n_train, left_gram, right_gram))
    valid_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'word_spacing',
                              'ko.wikipedia.org.dataset.sentences=%s.left=%d.right=%d.valid.gz' % (n_valid, left_gram, right_gram))
    test_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'word_spacing',
                             'ko.wikipedia.org.dataset.sentences=%s.left=%d.right=%d.test.gz' % (n_test, left_gram, right_gram))
    if not os.path.exists(train_file) or not os.path.exists(valid_file) or not os.path.exists(test_file):
        dataset_dir = os.path.dirname(train_file)
        if not os.path.exists(dataset_dir):
            os.makedirs(dataset_dir)

        watch.start('create dataset')
        log.info('create dataset...')
        data_files = (('train', KO_WIKIPEDIA_ORG_TRAIN_SENTENCES_FILE, n_train, train_file, False),
                      ('valid', KO_WIKIPEDIA_ORG_VALID_SENTENCES_FILE, n_valid, valid_file, False),
                      ('test', KO_WIKIPEDIA_ORG_TEST_SENTENCES_FILE, n_test, test_file, False))
        for name, data_file, total, dataset_file, to_one_hot_vector in data_files:
            check_interval = 10000
            log.info('check_interval: %s' % check_interval)
            log.info('%s %s total: %s' % (name, os.path.basename(data_file), NumUtil.comma_str(total)))

            features, labels = [], []
            with gzip.open(data_file, 'rt', encoding='utf8') as f:
                for i, line in enumerate(f, 1):
                    if total < i:
                        break
                    if i % check_interval == 0:
                        time.sleep(0.01)  # prevent cpu overload
                        percent = i / total * 100
                        log.info('create dataset... %.1f%% read. data len: %s. %s' % (percent, NumUtil.comma_str(len(features)), data_file))

                    _f, _l = WordSpacing.sentence2features_labels(line.strip(), left_gram=left_gram, right_gram=right_gram)
                    features.extend(_f)
                    labels.extend(_l)

            dataset = DataSet(features=features, labels=labels, features_vector=features_vector, labels_vector=labels_vector, name=name)
            log.info('dataset save... %s' % dataset_file)
            dataset.save(dataset_file, gzip_format=True, verbose=True)
            log.info('dataset save OK. %s' % dataset_file)
            log.info('dataset: %s' % dataset)
        log.info('create dataset OK.')
        log.info('')
        watch.stop('create dataset')

    watch.start('dataset load')
    log.info('dataset load...')
    train = DataSet.load(train_file, gzip_format=True, verbose=True)
    if n_train >= int('100,000'.replace(',', '')):
        valid = DataSet.load(valid_file, gzip_format=True, verbose=True)
    else:
        valid = DataSet.load(train_file, gzip_format=True, verbose=True)
    log.info('valid.convert_to_one_hot_vector()...')
    valid = valid.convert_to_one_hot_vector(verbose=True)
    log.info('valid.convert_to_one_hot_vector() OK.')
    log.info('train dataset: %s' % train)
    log.info('valid dataset: %s' % valid)
    log.info('dataset load OK.')
    log.info('')
    watch.stop('dataset load')

    graph = WordSpacing.build_FFNN(n_features, n_classes, n_hidden1, learning_rate, watch)
    train_step, X, Y, cost, predicted, accuracy = graph['train_step'], graph['X'], graph['Y'], graph['cost'], graph['predicted'], graph['accuracy']

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        check_interval = 10  # max(1, min(1000, n_train // 10))
        nth_train, nth_input, total_input = 0, 0, total_epoch * train.size

        log.info('learn...')
        log.info('total: %s' % NumUtil.comma_str(train.size))
        watch.start('learn')
        valid_cost = sys.float_info.max
        for epoch in range(1, total_epoch + 1):
            if valid_cost < early_stop_cost:
                break
            for step, (features_batch, labels_batch) in enumerate(train.next_batch(batch_size=batch_size), 1):
                if valid_cost < early_stop_cost:
                    log.info('valid_cost: %s, early_stop_cost: %s, early stopped.' % (valid_cost, early_stop_cost))
                    break
                nth_train += 1
                nth_input += features_batch.shape[0]
                sess.run(train_step, feed_dict={X: features_batch, Y: labels_batch})

                # if step % check_interval == 1:
                percent = nth_input / total_input * 100
                valid_cost = sess.run(cost, feed_dict={X: valid.features, Y: valid.labels})
                log.info('[epoch=%s][%.1f%%] %s cost: %.4f' % (epoch, percent, valid.name, valid_cost))
        watch.stop('learn')
        log.info('learn OK.\n')

        log.info('model save... %s' % model_file)
        watch.start('model save...')
        model_dir = os.path.dirname(model_file)
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
        saver = tf.train.Saver()
        saver.save(sess, model_file)
        watch.stop('model save...')
        log.info('model save OK. %s' % model_file)

    log.info('\n')
    log.info('batch_size: %s' % batch_size)
    log.info(watch.summary())
    log.info('\n')
    log.info('%s -> %s' % (features, labels))
    log.info('in : "%s"' % s)
    log.info('out: "%s"' % WordSpacing.spacing(s.replace(' ', ''), labels))
log.info('sample testing OK.\n')

if not os.path.exists(model_file + '.index') or not os.path.exists(model_file + '.meta'):
    if n_train >= int('100,000'.replace(',', '')):
        SlackUtil.send_message('%s start (max_sentences=%s, left_gram=%s, right_gram=%s)' % (sys.argv[0], n_train, left_gram, right_gram))
    WordSpacing.learning(total_epoch, n_train, n_valid, n_test, batch_size, left_gram, right_gram, model_file, features_vector, labels_vector,
                         n_hidden1=n_hidden1, learning_rate=learning_rate, early_stop_cost=early_stop_cost)
    if n_train >= int('100,000'.replace(',', '')):
        SlackUtil.send_message('%s end (max_sentences=%s, left_gram=%s, right_gram=%s)' % (sys.argv[0], n_train, left_gram, right_gram))

log.info('check result...')
watch = WatchUtil()
watch.start('read sentences')

sentences = []  # ['아버지가 방에 들어 가신다.', '가는 말이 고와야 오는 말이 곱다.']
max_test_sentences = 100

if n_train >= int('100,000'.replace(',', '')):
    sentences_file = test_sentences_file
else:
    sentences_file = train_sentences_file

with gzip.open(sentences_file, 'rt', encoding='utf8') as f:
    for i, line in enumerate(f, 1):
        if len(sentences) >= max_test_sentences:
            break
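WordSpacing.spacing() itself is not part of this excerpt; a plausible sketch, under the assumption that labels[i] == 1 means a space belongs right after character i of the unspaced input:

def spacing(s, labels):
    # hypothetical reconstruction: rebuild spaces from per-character labels
    out = []
    for i, ch in enumerate(s):
        out.append(ch)
        if i < len(labels) and labels[i] == 1:
            out.append(' ')
    return ''.join(out).rstrip()

# e.g. spacing('아버지가방에들어가신다.', labels) -> '아버지가 방에 들어 가신다.'
# when labels marks the boundaries after '가', '에', and '어'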
def learning(cls, total_epoch, n_train, n_valid, n_test, batch_size, window_size, noise_rate, model_file, features_vector, labels_vector,
             n_hidden1, learning_rate, dropout_keep_rate, early_stop_cost=0.001):
    n_features = len(features_vector) * window_size  # number of features = 17,382 * 10

    log.info('load characters list...')
    log.info('load characters list OK. len: %s' % NumUtil.comma_str(len(features_vector)))
    watch = WatchUtil()

    train_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'spelling_error_correction',
                              'ko.wikipedia.org.dataset.sentences=%s.window_size=%d.train.gz' % (n_train, window_size))
    valid_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'spelling_error_correction',
                              'ko.wikipedia.org.dataset.sentences=%s.window_size=%d.valid.gz' % (n_valid, window_size))
    test_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'spelling_error_correction',
                             'ko.wikipedia.org.dataset.sentences=%s.window_size=%d.test.gz' % (n_test, window_size))
    log.info('train_file: %s' % train_file)
    log.info('valid_file: %s' % valid_file)
    log.info('test_file: %s' % test_file)

    if not os.path.exists(train_file) or not os.path.exists(valid_file) or not os.path.exists(test_file):
        dataset_dir = os.path.dirname(train_file)
        if not os.path.exists(dataset_dir):
            os.makedirs(dataset_dir)

        watch.start('create dataset')  # FIXME: out of memory (1M sentences)
        log.info('create dataset...')
        data_files = (('train', KO_WIKIPEDIA_ORG_TRAIN_SENTENCES_FILE, n_train, train_file, False),
                      ('valid', KO_WIKIPEDIA_ORG_VALID_SENTENCES_FILE, n_valid, valid_file, False),
                      ('test', KO_WIKIPEDIA_ORG_TEST_SENTENCES_FILE, n_test, test_file, False))
        for (name, data_file, total, dataset_file, to_one_hot_vector) in data_files:
            check_interval = 10000
            log.info('check_interval: %s' % check_interval)
            log.info('%s %s total: %s' % (name, os.path.basename(data_file), NumUtil.comma_str(total)))
            log.info('noise_rate: %s' % noise_rate)

            features, labels = [], []
            with gzip.open(data_file, 'rt') as f:
                for i, line in enumerate(f, 1):
                    if total < i:
                        break
                    if i % check_interval == 0:
                        time.sleep(0.01)  # prevent cpu overload
                        percent = i / total * 100
                        log.info('create dataset... %.1f%% read. data len: %s. %s' % (percent, NumUtil.comma_str(len(features)), data_file))

                    sentence = line.strip()
                    for start in range(0, len(sentence) - window_size + 1):  # generate noise (a blank) at the character level
                        chars = sentence[start: start + window_size]
                        for idx in range(len(chars)):
                            noised_chars = StringUtil.replace_with_index(chars, ' ', idx)  # see the sketch of replace_with_index after this function
                            features.append(noised_chars)
                            labels.append(chars)
                            log.debug('create dataset... %s "%s" -> "%s"' % (name, noised_chars, chars))

                    # log.info('noise_sampling: %s' % noise_sampling)
                    # for nth_sample in range(noise_sampling):  # generate noise at the jamo level (initial/medial/final)
                    #     for start in range(0, len(sentence) - window_size + 1):
                    #         chars = sentence[start: start + window_size]
                    #         noised_chars = SpellingErrorCorrection.encode_noise(chars, noise_rate=noise_rate, noise_with_blank=True)
                    #         if chars == noised_chars:
                    #             continue
                    #         if i % check_interval == 0 and nth_sample == 0:
                    #             log.info('create dataset... %s "%s" -> "%s"' % (name, noised_chars, chars))
                    #         features.append(noised_chars)
                    #         labels.append(chars)

            # print('dataset features:', features)
            # print('dataset labels:', labels)
            dataset = DataSet(features=features, labels=labels, features_vector=features_vector, labels_vector=labels_vector, name=name)
            log.info('dataset save... %s' % dataset_file)
            dataset.save(dataset_file, gzip_format=True, verbose=True)
            log.info('dataset save OK. %s' % dataset_file)
            log.info('dataset: %s' % dataset)
        log.info('create dataset OK.')
        log.info('')
        watch.stop('create dataset')

    watch.start('dataset load')
    log.info('dataset load...')
    train = DataSet.load(train_file, gzip_format=True, verbose=True)
    if n_train >= int('100,000'.replace(',', '')):
        valid = DataSet.load(valid_file, gzip_format=True, verbose=True)
    else:
        valid = DataSet.load(train_file, gzip_format=True, verbose=True)
    log.info('valid.convert_to_one_hot_vector()...')
    valid = valid.convert_to_one_hot_vector(verbose=True)
    log.info('valid.convert_to_one_hot_vector() OK.')
    log.info('train dataset: %s' % train)
    log.info('valid dataset: %s' % valid)
    log.info('dataset load OK.')
    log.info('')
    watch.stop('dataset load')

    X, Y, dropout_keep_prob, train_step, cost, y_hat, accuracy = SpellingErrorCorrection.build_DAE(n_features, window_size, noise_rate, n_hidden1, learning_rate, watch)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        check_interval = max(1, min(1000, n_train // 10))
        nth_train, nth_input, total_input = 0, 0, total_epoch * train.size

        log.info('')
        log.info('learn...')
        log.info('total_epoch: %s' % total_epoch)
        log.info('train.size (total features): %s' % NumUtil.comma_str(train.size))
        log.info('check_interval: %s' % check_interval)
        log.info('batch_size: %s' % batch_size)
        log.info('total_input: %s (total_epoch * train.size)' % total_input)
        log.info('')
        watch.start('learn')

        valid_cost = sys.float_info.max
        for epoch in range(1, total_epoch + 1):
            if valid_cost < early_stop_cost:
                log.info('valid_cost: %s, early_stop_cost: %s, early stopped.' % (valid_cost, early_stop_cost))
                break
            for step, (features_batch, labels_batch) in enumerate(train.next_batch(batch_size=batch_size, to_one_hot_vector=True), 1):
                if valid_cost < early_stop_cost:
                    break
                nth_train += 1
                nth_input += features_batch.shape[0]
                sess.run(train_step, feed_dict={X: features_batch, Y: labels_batch, dropout_keep_prob: dropout_keep_rate})

                # if nth_train % check_interval == 1:
                percent = nth_input / total_input * 100
                valid_cost = sess.run(cost, feed_dict={X: valid.features, Y: valid.labels, dropout_keep_prob: 1.0})
                log.info('[epoch=%s][%.1f%%] %s cost: %.8f' % (epoch, percent, valid.name, valid_cost))
        watch.stop('learn')
        log.info('learn OK.')
        log.info('')

        log.info('model save... %s' % model_file)
        watch.start('model save...')
        model_dir = os.path.dirname(model_file)
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
        saver = tf.train.Saver()
        saver.save(sess, model_file)
        watch.stop('model save...')
        log.info('model save OK. %s' % model_file)
        log.info('')

    log.info('total_epoch: %s' % total_epoch)
    log.info('batch_size: %s' % batch_size)
    log.info('total_input: %s (total_epoch * train.size)' % total_input)
    log.info('')
    log.info(watch.summary())
    log.info('')
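StringUtil.replace_with_index() is defined outside this excerpt; from its call in the dataset loop above it plausibly replaces the character at a single index, which is how one noised window per character position is produced:

def replace_with_index(s, replacement, idx):
    # hypothetical sketch: return s with the character at position idx replaced
    return s[:idx] + replacement + s[idx + 1:]

assert replace_with_index('abcde', ' ', 2) == 'ab de'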
def build_DAE(cls, n_features, window_size, noise_rate, n_hidden1, learning_rate, watch=WatchUtil()):
    if len(cls.graph) == 0:
        log.info('')
        log.info('create tensorflow graph...')
        watch.start('create tensorflow graph')

        features_vector_size = n_features // window_size
        log.info('n_features: %s' % n_features)
        log.info('window_size: %s' % window_size)
        log.info('features_vector_size: %s' % features_vector_size)
        log.info('noise_rate: %.1f' % noise_rate)
        log.info('n_hidden1: %s' % n_hidden1)

        tf.set_random_seed(777)  # for reproducibility

        X = tf.placeholder(tf.float32, [None, n_features], name='X')  # shape=(batch_size, window_size * feature_vector.size)
        Y = tf.placeholder(tf.float32, [None, n_features], name='Y')  # shape=(batch_size, window_size * feature_vector.size)
        dropout_keep_prob = tf.placeholder(tf.float32)

        # layers = 3
        # n_hidden2 = n_hidden1
        # W1 = tf.Variable(tf.random_normal([n_features, n_hidden1]), name='W1')
        # b1 = tf.Variable(tf.random_normal([n_hidden1]), name='b1')
        # layer1 = tf.nn.sigmoid(tf.matmul(X, W1) + b1, name='layer1')
        # layer1_dropout = tf.nn.dropout(layer1, dropout_keep_prob, name='layer1_dropout')
        #
        # W2 = tf.Variable(tf.random_normal([n_hidden1, n_hidden2]), name='W2')
        # b2 = tf.Variable(tf.random_normal([n_hidden2]), name='b2')
        # layer2 = tf.nn.sigmoid(tf.matmul(layer1_dropout, W2) + b2, name='layer2')
        # layer2_dropout = tf.nn.dropout(layer2, dropout_keep_prob, name='layer2_dropout')
        #
        # W3 = tf.Variable(tf.random_normal([n_hidden1, n_features]), name='W3')
        # b3 = tf.Variable(tf.random_normal([n_features]), name='b3')
        # y_hat = tf.add(tf.matmul(layer2_dropout, W3), b3, name='y_hat')

        # layers = 2
        W1 = tf.Variable(tf.random_normal([n_features, n_hidden1]), name='W1')
        b1 = tf.Variable(tf.random_normal([n_hidden1]), name='b1')
        layer1 = tf.nn.sigmoid(tf.matmul(X, W1) + b1, name='layer1')
        layer1_dropout = tf.nn.dropout(layer1, dropout_keep_prob, name='layer1_dropout')

        W2 = tf.Variable(tf.random_normal([n_hidden1, n_features]), name='W2')
        b2 = tf.Variable(tf.random_normal([n_features]), name='b2')
        y_hat = tf.add(tf.matmul(layer1_dropout, W2), b2, name='y_hat')  # shape=(batch_size, window_size * feature_vector.size)

        labels_hat = tf.reshape(y_hat, shape=(-1, window_size, features_vector_size))  # shape=(batch_size, window_size, feature_vector.size)
        labels = tf.reshape(Y, shape=(-1, window_size, features_vector_size))  # shape=(batch_size, window_size, feature_vector.size)

        cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=labels_hat, labels=labels), name='cost')
        train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
        accuracy = tf.reduce_mean(tf.cast(tf.abs(tf.nn.softmax(y_hat) - Y) < 0.1, dtype=tf.float32), name='accuracy')

        # log.debug('X:', X)
        # log.debug('Y:', Y)
        # log.debug('y_hat:', y_hat)
        # log.debug('labels_hat:', labels_hat)
        # log.debug('labels:', labels)
        # log.debug('cost:', cost)
        # log.debug('accuracy:', accuracy)

        watch.stop('create tensorflow graph')
        log.info('create tensorflow graph OK.')
        log.info('')
        cls.graph = {'X': X, 'Y': Y, 'dropout_keep_prob': dropout_keep_prob,
                     'train_step': train_step, 'cost': cost, 'y_hat': y_hat, 'accuracy': accuracy}
    return cls.graph['X'], cls.graph['Y'], cls.graph['dropout_keep_prob'], \
           cls.graph['train_step'], cls.graph['cost'], cls.graph['y_hat'], cls.graph['accuracy']
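A minimal sketch of driving the DAE graph above. The sizes and batches are hypothetical; note that dropout_keep_prob is fed as 1.0 outside of training, matching the learning() loop:

import numpy as np
import tensorflow as tf

window_size, features_vector_size = 10, 100  # hypothetical sizes
n_features = window_size * features_vector_size

X, Y, dropout_keep_prob, train_step, cost, y_hat, accuracy = SpellingErrorCorrection.build_DAE(
    n_features, window_size, noise_rate=0.1, n_hidden1=100, learning_rate=0.001)

x_batch = np.random.rand(4, n_features).astype(np.float32)  # noised one-hot windows (hypothetical)
y_batch = x_batch.copy()                                    # clean targets (hypothetical)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    _, c = sess.run([train_step, cost], feed_dict={X: x_batch, Y: y_batch, dropout_keep_prob: 0.5})
    c_valid = sess.run(cost, feed_dict={X: x_batch, Y: y_batch, dropout_keep_prob: 1.0})  # no dropout at validation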
if is_training:  # training
    x, y, learning_rate, W1, b1, y_hat, cost, train_step, summary = create_graph(model_name, scope_name, verbose=False)

    train_x_batch, train_y_batch = input_pipeline([train_file], batch_size=batch_size, delim='\t', splits=3)
    valid_x_batch, valid_y_batch = input_pipeline([valid_file], batch_size=n_valid, delim='\t', splits=3)

    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver(max_to_keep=None)

    train_writer = tf.summary.FileWriter(TENSORBOARD_LOG_DIR + '/train', sess.graph)
    valid_writer = tf.summary.FileWriter(TENSORBOARD_LOG_DIR + '/valid', sess.graph)

    coordinator = tf.train.Coordinator()  # coordinator for enqueue threads
    threads = tf.train.start_queue_runners(sess=sess, coord=coordinator)  # start filename queue
    batch_count = math.ceil(n_train / batch_size)  # batch count for one epoch
    try:
        watch = WatchUtil()
        stop_timer = TimerUtil(interval_secs=total_train_time)
        valid_timer = TimerUtil(interval_secs=valid_check_interval)
        watch.start()
        stop_timer.start()
        valid_timer.start()

        nth_batch, min_valid_epoch, min_valid_cost = 0, 0, 1e10
        epoch, running = 0, True
        while running:
            epoch += 1
            for i in range(1, batch_count + 1):
                if stop_timer.is_over():
                    running = False
                    break
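input_pipeline() is not shown in this excerpt; a common TF1 queue-based reader consistent with the call above would look roughly like this (the tab-delimited layout and all-float columns are assumptions):

def input_pipeline(filenames, batch_size, delim='\t', splits=3):
    # hypothetical sketch: filename queue -> line reader -> decode_csv -> shuffled batches
    filename_queue = tf.train.string_input_producer(filenames)
    reader = tf.TextLineReader()
    _, line = reader.read(filename_queue)
    columns = tf.decode_csv(line, record_defaults=[[0.0]] * splits, field_delim=delim)
    x = tf.stack(columns[:-1])  # assumed: first splits-1 columns are features
    y = tf.stack(columns[-1:])  # assumed: last column is the target
    return tf.train.shuffle_batch([x, y], batch_size=batch_size,
                                  capacity=batch_size * 10, min_after_dequeue=batch_size)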
def train(self, iterations: int, batch: int, embedding: Word2VecEmbedding, args: argparse.Namespace) -> str:
    batches_in_epoch = int(numpy.ceil(len(self.dataloader.dataset) / batch))
    total_batches = batches_in_epoch * iterations
    nth_total_batch = 0
    log.info(f'batches_in_epoch: {batches_in_epoch}')
    log.info(f'total_batches: {total_batches}')

    watch = WatchUtil(auto_stop=False)
    watch.start()
    best_loss = float("inf")
    first_epoch, last_epoch = self.epoch + 1, self.epoch + iterations + 1
    last_embedding_file = None

    log.info(Word2VecEmbedding.get_filenpath(args))
    for self.epoch in range(first_epoch, last_epoch):
        log.info(f"[e{self.epoch:2d}] {self}")
        loss_list = []
        for nth, (iword, owords) in enumerate(self.dataloader, 1):
            try:
                loss = self.sgns(iword, owords)
            except RuntimeError:
                loss_list = [float('-inf')]
                break

            self.optim.zero_grad()
            loss.backward()
            self.optim.step()
            # if nth_batch == 1 and self.scheduler is not None and self.epoch >= self.decay_start_epoch:  # TODO: TEST
            #     self.scheduler.step()

            if self.learning_decay != 0:
                PytorchUtil.set_learning_rate(self.optim, self.epoch, gamma=self.learning_decay, base_lr=self.init_lr,
                                              min_lr=1e-10, decay_start=2, decay_interval=3)

            lr = PytorchUtil.get_learning_rate(self.optim)
            _, negatives = owords.size()
            real_loss = loss.data[0] / float(negatives)
            loss_list.append(real_loss)

            nth_total_batch += 1
            progressed = nth_total_batch / total_batches
            seconds_per_batch = float(watch.elapsed()) / float(nth_total_batch)
            remain_batches = total_batches - nth_total_batch
            remain_secs = int(seconds_per_batch * remain_batches)

            if nth == 1 or nth == batches_in_epoch or nth % 1000 == 0:
                log.info(f"[e{self.epoch:2d}][b{nth:5d}/{batches_in_epoch:5d}][{progressed*100:.1f}% remain: {DateUtil.secs_to_string(remain_secs)}][window: {self.window}][lr: {lr:.0e}] loss: {real_loss:.7f}")

        total_loss = numpy.mean(loss_list)
        log.info(f"[e{self.epoch:2d}][window: {self.window}][lr: {lr:.0e}] total_loss: {total_loss:.7f}, best_loss: {best_loss:.7f}")

        if total_loss > best_loss or total_loss == float('inf') or total_loss == float('-inf'):  # worse loss than before, or diverged
            log.info('')
            log.info(f"[e{self.epoch:2d}][window: {self.window}][lr: {lr:.0e}] total_loss > best_loss BREAK")
            log.info('')
            break
        else:
            if total_loss < best_loss:
                best_loss = total_loss
            log.info(f"[e{self.epoch:2d}][window: {self.window}][lr: {lr:.0e}] embedding.save()...")
            args.epoch = self.epoch
            last_embedding_file = embedding.save(idx2vec=self.embedding, filepath=Word2VecEmbedding.get_filenpath(args))
            log.info(f"[e{self.epoch:2d}][window: {self.window}][lr: {lr:.0e}] embedding.save() OK. {os.path.basename(embedding.filepath)}")
    return last_embedding_file
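PytorchUtil.set_learning_rate() is external to this excerpt; judging by its arguments (gamma, base_lr, min_lr, decay_start, decay_interval) it likely applies step-wise exponential decay. A self-contained sketch of that assumed schedule:

import torch

def decayed_lr(epoch, base_lr, gamma, min_lr=1e-10, decay_start=2, decay_interval=3):
    # hypothetical: multiply base_lr by gamma once per decay_interval epochs after decay_start
    if epoch < decay_start:
        return base_lr
    steps = (epoch - decay_start) // decay_interval + 1
    return max(min_lr, base_lr * (gamma ** steps))

model = torch.nn.Linear(4, 2)  # stand-in model
optim = torch.optim.Adam(model.parameters(), lr=1e-3)
for epoch in range(1, 11):
    for param_group in optim.param_groups:  # the same mechanism set_learning_rate presumably uses
        param_group['lr'] = decayed_lr(epoch, base_lr=1e-3, gamma=0.5)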
                    default=Word2VecEmbedding.SUBSAMPLE, type=float, help="subsample threshold (default: 1e-5)")
parser.add_argument('--learning_rate', default=Word2VecEmbedding.LEARNING_RATE, type=float, help="learning rate for AdamOptimizer")
parser.add_argument('--learning_decay', default=Word2VecEmbedding.LEARNING_DECAY, type=float, help="exponential decay gamma (default: 0.0=no decay)")
args = parser.parse_args()
log.info(args)

watch = WatchUtil(auto_stop=True)
try:
    log.info(f'load {args.corpus_file} ...')
    watch.start()
    corpus = Word2VecCorpus.load(filepath=args.corpus_file)
    log.info(f'load {args.corpus_file} OK. (elapsed: {watch.elapsed_string()})')
    log.info(corpus.vocab)

    if len(corpus.vocab) > 1e5:  # out of memory (11GB GPU memory)
        args.device_no = None

    log.info('')
    log.info(args)
    log.info('')
log.info('weights_initializer: %s' % weights_initializer.__name__)
log.info('learning_rate: %.4f' % learning_rate)
log.info('train_time: %s' % train_time)

how_many_trains = 3 if train_time < 10 else 1
log.info('how_many_trains: %s' % how_many_trains)
for _ in range(how_many_trains):
    time.sleep(1)
    tf.reset_default_graph()  # clears the default graph stack and resets the global default graph
    tf.set_random_seed(7942)  # 3. make results reproducible (cost: 600-700)
    scope_name = '%s.%s' % (func.__name__, DateUtil.current_yyyymmdd_hhmmss())
    x, y, y_hat, cost, rsme, train_step, summary = build_graph(scope_name, n_features, n_hiddens, n_classes, learning_rate,
                                                               activation=activation, weights_initializer=weights_initializer, bias_value=bias_value)
    try:
        watch = WatchUtil()
        model_file_saved = False
        model_file = os.path.join(MODELS_DIR, '%s_%s/model' % (os.path.basename(__file__.replace('.py', '')), func.__name__))
        model_dir = os.path.dirname(model_file)
        # log.info('model_file: %s' % model_file)
        if not os.path.exists(model_dir):
            # log.info('model_dir: %s' % model_dir)
            os.makedirs(model_dir)

        config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
        saver = tf.train.Saver()
        with tf.Session(config=config) as sess:
            sess.run(tf.global_variables_initializer())
            train_writer = tf.summary.FileWriter(TENSORBOARD_LOG_DIR + '/train', sess.graph)
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver(max_to_keep=None)

    if is_training:  # training
        train_writer = tf.summary.FileWriter(TENSORBOARD_LOG_DIR + '/train', sess.graph)
        valid_writer = tf.summary.FileWriter(TENSORBOARD_LOG_DIR + '/valid', sess.graph)

        batch_count = math.ceil(n_train / batch_size)  # batch count for one epoch
        try:
            watch = WatchUtil()
            stop_timer = TimerUtil(interval_secs=total_train_time)
            valid_timer = TimerUtil(interval_secs=valid_check_interval)
            watch.start()
            stop_timer.start()
            valid_timer.start()

            nth_batch, min_valid_epoch, min_valid_cost = 0, 0, 1e10
            epoch, running = 0, True
            while running:
                epoch += 1
                for _x_batch, _y_batch in next_batch_in_memory('train'):
                    if stop_timer.is_over():
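next_batch_in_memory() is defined outside this excerpt; a plausible generator given its use above, assuming module-level numpy arrays (the names x_train/y_train and x_valid/y_valid are assumptions):

def next_batch_in_memory(name, batch_size=100):
    # hypothetical sketch: yield successive (x, y) slices from arrays already in memory
    x, y = (x_train, y_train) if name == 'train' else (x_valid, y_valid)
    for start in range(0, len(x), batch_size):
        yield x[start:start + batch_size], y[start:start + batch_size]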
def learning(cls, sentences_file, batch_size, left_gram, right_gram, model_file, features_vector, labels_vector,
             n_hidden1=100, max_sentences=0, learning_rate=0.01, layers=2):
    ngram = left_gram + right_gram
    n_features = len(features_vector) * ngram  # number of features = 17,380 * 4
    n_classes = len(labels_vector) if len(labels_vector) >= 3 else 1  # number of classes = 2 but len=1

    log.info('load characters list...')
    log.info('load characters list OK. len: %s\n' % NumUtil.comma_str(len(features_vector)))
    watch = WatchUtil()

    train_file = os.path.join(KO_WIKIPEDIA_ORG_DATA_DIR, 'datasets',
                              'ko.wikipedia.org.dataset.sentences=%d.left=%d.right=%d.train.gz' % (max_sentences, left_gram, right_gram))
    validation_file = train_file.replace('.train.', '.validation.')
    test_file = train_file.replace('.train.', '.test.')
    if not os.path.exists(train_file) or not os.path.exists(validation_file) or not os.path.exists(test_file):
        watch.start('create dataset')
        log.info('create dataset...')
        features, labels = [], []
        check_interval = min(10000, math.ceil(max_sentences))
        log.info('total: %s' % NumUtil.comma_str(max_sentences))

        with gzip.open(sentences_file, 'rt') as f:
            for i, line in enumerate(f, 1):
                if max_sentences < i:
                    break

                if i % check_interval == 0:
                    log.info('create dataset... %.1f%% read. data len: %s' % (i / max_sentences * 100, NumUtil.comma_str(len(features))))

                _f, _l = WordSpacing.sentence2features_labels(line.strip(), left_gram=left_gram, right_gram=right_gram)
                features.extend(_f)
                labels.extend(_l)

        dataset = DataSet(features=features, labels=labels, features_vector=features_vector, labels_vector=labels_vector, name='all')
        log.info('dataset: %s' % dataset)
        log.info('create dataset OK.\n')
        watch.stop('create dataset')

        watch.start('dataset save')
        log.info('split to train, test, validation...')
        datasets = DataSets.to_datasets(dataset, test_rate=0.1, valid_rate=0.1, test_max=10000, valid_max=1000, shuffle=True)
        train, test, validation = datasets.train, datasets.test, datasets.validation
        log.info(train)
        log.info(test)
        log.info(validation)
        # log.info('%s %s' % (test.features[0], test.labels[0]))
        log.info('split to train, test, validation OK.\n')

        log.info('dataset save... %s' % train_file)
        train.save(train_file, verbose=True)  # save as text
        log.info('dataset save OK.\n')

        log.info('dataset save... %s' % validation_file)
        validation = validation.convert_to_one_hot_vector(verbose=True)  # save as vector
        validation.save(validation_file, verbose=True)
        log.info('dataset save OK.\n')

        log.info('dataset save... %s' % test_file)
        test = test.convert_to_one_hot_vector(verbose=True)
        test.save(test_file, verbose=True)  # save as vector
        log.info('dataset save OK.\n')
        watch.stop('dataset save')
    else:
        watch.start('dataset load')
        log.info('dataset load...')
        train = DataSet.load(train_file, verbose=True)
        validation = DataSet.load(validation_file, verbose=True)
        test = DataSet.load(test_file, verbose=True)
        log.info(train)
        log.info(validation)
        log.info(test)
        log.info('dataset load OK.\n')
        watch.stop('dataset load')

    log.info('check samples...')
    for i, (features_batch, labels_batch) in enumerate(train.next_batch(batch_size=5, to_one_hot_vector=True), 1):
        if i > 2:
            break
        for a, b in zip(features_batch, labels_batch):
            feature, label = a, b
            _feature = feature.reshape((ngram, len(features_vector)))
            chars = ''.join(features_vector.to_values(_feature))
            has_space = np.argmax(label)
            log.info('[%s] %s -> %s, %s (len=%s) %s (len=%s)' % (i, chars, has_space, feature, len(feature), label, len(label)))
    log.info('check samples OK.\n')

    graph = WordSpacing.build_FFNN(n_features, n_classes, n_hidden1, learning_rate, watch, layers=layers)
    train_step, X, Y, cost, hypothesis, predicted, accuracy = graph['train_step'], graph['X'], graph['Y'], graph['cost'], graph['hypothesis'], graph['predicted'], graph['accuracy']

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        n_input = 0
        log.info('total: %s' % NumUtil.comma_str(train.size))
        log.info('learn...')
        watch.start('learn')
        for step, (features_batch, labels_batch) in enumerate(train.next_batch(batch_size=batch_size), 1):
            n_input += batch_size
            sess.run(train_step, feed_dict={X: features_batch, Y: labels_batch})
            log.info('[%s][%.1f%%] validation cost: %.4f' % (NumUtil.comma_str(n_input), n_input / train.size * 100,
                                                             sess.run(cost, feed_dict={X: validation.features, Y: validation.labels})))
        watch.stop('learn')
        log.info('learn OK.\n')

        log.info('evaluate...')
        watch.start('evaluate...')
        _hypothesis, _correct, _accuracy = sess.run([hypothesis, predicted, accuracy], feed_dict={X: test.features, Y: test.labels})  # accuracy report
        watch.stop('evaluate...')
        log.info('evaluate OK.')

        log.info('model save... %s' % model_file)
        watch.start('model save...')
        model_dir = os.path.dirname(model_file)
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
        saver = tf.train.Saver()
        saver.save(sess, model_file)
        watch.stop('model save...')
        log.info('model save OK. %s' % model_file)

    log.info('\n')
    log.info(watch.summary())
    # log.info('hypothesis: %s %s' % (_hypothesis.shape, _hypothesis))
    # log.info('correct: %s %s' % (_correct.shape, _correct))
    log.info('accuracy: %s %s' % (_accuracy.shape, _accuracy))
    log.info('\n')
print('%s -> %s -> %s -> %s -> %s' % (x_train.shape[1], n_hiddens, activation.__name__, n_hiddens, 1))
print('weights_initializer: %s' % weights_initializer.__name__)
print('learning_rate: %.4f' % learning_rate)
print('train_time: %s' % train_time)

how_many_trains = 3 if train_time <= 1 else 1  # run the experiment 3 times when train_time is 1 second; otherwise once
for _ in range(how_many_trains):
    # time.sleep(1)
    tf.reset_default_graph()  # reset the existing graph/session
    tf.set_random_seed(7942)  # required because tf.random_normal_initializer is used
    scope_name = '%s.%s' % (func.__name__, DateUtil.current_yyyymmdd_hhmmss())  # recommended: name scopes with func + timestamp so graphs don't collide
    x, y, y_hat, cost, rsme, train_step, summary = build_graph(scope_name, n_features, n_hiddens, n_classes, learning_rate,
                                                               activation=activation, weights_initializer=weights_initializer, bias_value=bias_value)
    try:
        watch = WatchUtil()
        model_file_saved = False
        model_file = os.path.join('%s/workspace/nlp4kor/models/%s_%s/model' % (os.getcwd(), os.path.basename(__name__.replace('.py', '')), func.__name__))
        model_dir = os.path.dirname(model_file)
        # print('model_file: %s' % model_file)
        if not os.path.exists(model_dir):
            # print('model_dir: %s' % model_dir)
            os.makedirs(model_dir)

        config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
        saver = tf.train.Saver()  # by default only the 5 most recent checkpoints are kept; pass max_to_keep=None to keep all
        with tf.Session(config=config) as sess:
            sess.run(tf.global_variables_initializer())
            train_writer = tf.summary.FileWriter(TENSORBOARD_LOG_DIR + '/train', sess.graph)