def dump_corpus(mongo_url, db_name, collection_name, sentences_file, mongo_query=None, limit=None):
    """
    Read documents from MongoDB and store them sentence by sentence.
    (Sentences that contain only one word, or no Hangul at all, are not extracted.)
    :param mongo_url: mongodb://~~~
    :param db_name: database name of mongodb
    :param collection_name: collection name of mongodb
    :param sentences_file: *.sentence file
    :param mongo_query: default={}
    :param limit:
    :return:
    """
    if mongo_query is None:
        mongo_query = {}

    corpus_mongo = MongodbUtil(mongo_url, db_name=db_name, collection_name=collection_name)
    total = corpus_mongo.count()
    log.info('%s total: %s' % (corpus_mongo, NumUtil.comma_str(total)))

    output_dir = os.path.dirname(sentences_file)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    with gzip.open(sentences_file, 'wt') as out_f:
        for i, row in enumerate(corpus_mongo.find(mongo_query, limit=limit)):
            # print('url:', row['url'])
            for c in row['content']:
                if i % 1000 == 0:
                    print('%.1f%% written.' % (i / total * 100))
                for s in HangulUtil.text2sentences(c['sentences']):
                    if HangulUtil.has_hangul(s):
                        out_f.write(s)
                        out_f.write('\n')
def __to_one_hot_vector(self, features_batch: np.ndarray, labels_batch: np.ndarray, verbose=False): _features, _labels = [], [] check_interval = min(1000, math.ceil(features_batch.shape[0])) for i, (feature_string, label_string) in enumerate(zip(features_batch, labels_batch)): if isinstance(feature_string, str) or isinstance( feature_string, list): feature_v = self.features_vector.to_vectors( feature_string) # to 2 dim feature = np.concatenate(feature_v) # to 1 dim else: feature = self.features_vector.to_vector( feature_string) # to 1 dim if isinstance(label_string, str) or isinstance(label_string, list): label_v = self.labels_vector.to_vectors( label_string) # to 2 dim label = np.concatenate(label_v) # to 1 dim else: label = self.labels_vector.to_vector(label_string) # to 1 dim _features.append(feature) _labels.append(label) if verbose and i % check_interval == 0: log.info( '[%s] to_one_hot_vector %s -> %s, %s (len=%s) %s (len=%s)' % (i, feature_string, label, feature, len(feature), label, len(label))) return np.asarray(_features, dtype=np.int32), np.asarray(_labels, dtype=np.int32)
def encode_noise(cls, s, noise_rate=0.1, noise_with_blank=False, verbose=False):
    try:
        hangul_indexs = [idx for idx, c in enumerate(s) if HangulUtil.is_hangul_char(c)]
        if len(hangul_indexs) == 0:
            return s

        target_indexs = np.random.choice(hangul_indexs, math.ceil(len(hangul_indexs) * noise_rate), replace=False)

        _s = list(s)
        for idx in target_indexs:
            if noise_with_blank:
                _s[idx] = ' '
            else:
                c = s[idx]
                _c = HangulUtil.encode_noise(c)
                if verbose:
                    log.info('encode: %s -> %s' % (c, _c))
                _s[idx] = _c
        return ''.join(_s)
    except:
        return s
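# A minimal, self-contained sketch of the idea behind encode_noise (illustration only;
# it uses a plain Unicode-range check as a stand-in for HangulUtil.is_hangul_char and
# only shows the noise_with_blank=True case): pick a noise_rate fraction of the Hangul
# positions and blank them out.
import math

import numpy as np


def blank_noise(s, noise_rate=0.1):
    hangul_indexes = [i for i, c in enumerate(s) if '가' <= c <= '힣']  # stand-in for HangulUtil.is_hangul_char
    if not hangul_indexes:
        return s
    n_noise = math.ceil(len(hangul_indexes) * noise_rate)
    targets = np.random.choice(hangul_indexes, n_noise, replace=False)
    chars = list(s)
    for i in targets:
        chars[i] = ' '  # blank out the chosen character
    return ''.join(chars)


print(blank_noise('한국어 위키백과 문장입니다.', noise_rate=0.3))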
def dump_urls(mongo_url, db_name, collection_name, urls_file, mongo_query=None, limit=0):
    if mongo_query is None:
        mongo_query = {}

    corpus_mongo = MongodbUtil(mongo_url, db_name=db_name, collection_name=collection_name)
    total = corpus_mongo.count()
    log.info('%s total: %s' % (corpus_mongo, NumUtil.comma_str(total)))

    output_dir = os.path.dirname(urls_file)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    with open(urls_file, 'wt') as out_f:
        for i, row in enumerate(corpus_mongo.find(mongo_query, limit=limit)):
            if i % 1000 == 0:
                log.info('%s %.1f%% written.' % (os.path.basename(urls_file), i / total * 100))
            out_f.write(row['url'])
            out_f.write('\n')
def decode_noise(cls, noised_sentence, features_list, labels_list, verbose=False): try: if len(features_list) != len(labels_list) or len( features_list[0]) != len(labels_list[0]): return noised_sentence idx2chars = dict() for feature, label in zip(features_list, labels_list): for off in [ i for i in range(len(feature)) if feature[i] != label[i] ]: for start in [ m.start() for m in re.finditer(feature, noised_sentence) ]: idx = start + off if idx not in idx2chars: idx2chars[idx] = label[off] sentence = list(noised_sentence) for idx, char in idx2chars.items(): if verbose: log.info('denoise: "%s" -> "%s"' % (noised_sentence[idx], char)) sentence[idx] = char return ''.join(sentence) except: return noised_sentence
def __build_FFNN_layers2(cls, n_features, n_classes, n_hidden1, learning_rate, watch=WatchUtil()):
    if len(cls.graph_nodes) == 0:
        log.info('create tensorflow graph...')
        watch.start('create tensorflow graph')
        log.info('n_features: %s' % n_features)
        log.info('n_classes: %s' % n_classes)
        log.info('n_hidden1: %s' % n_hidden1)

        tf.set_random_seed(777)  # for reproducibility

        X = tf.placeholder(tf.float32, [None, n_features], name='X')  # two characters
        Y = tf.placeholder(tf.float32, [None, n_classes], name='Y')

        W1 = tf.Variable(tf.random_normal([n_features, n_hidden1]), name='W1')
        b1 = tf.Variable(tf.random_normal([n_hidden1]), name='b1')
        layer1 = tf.sigmoid(tf.matmul(X, W1) + b1, name='layer1')

        W2 = tf.Variable(tf.random_normal([n_hidden1, n_classes]), name='W2')
        b2 = tf.Variable(tf.random_normal([n_classes]), name='b2')
        hypothesis = tf.sigmoid(tf.matmul(layer1, W2) + b2, name='hypothesis')

        cost = -tf.reduce_mean(Y * tf.log(hypothesis) + (1 - Y) * tf.log(1 - hypothesis), name='cost')  # cost/loss function

        # train_step = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(cost)  # Too bad. sentences=10000 + layer=2, 20 min, Accuracy: 0.689373, cost: 0.8719
        train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)  # Very good!! sentences=10000 + layer=2, 10 min, accuracy 0.9194, cost: 0.2139

        predicted = tf.cast(hypothesis > 0.5, dtype=tf.float32, name='predicted')  # 0 <= hypothesis <= 1
        accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted, Y), dtype=tf.float32), name='accuracy')

        watch.stop('create tensorflow graph')
        log.info('create tensorflow graph OK.\n')
        cls.graph_nodes = {'hypothesis': hypothesis, 'predicted': predicted, 'accuracy': accuracy, 'X': X, 'Y': Y, 'train_step': train_step, 'cost': cost}
    return cls.graph_nodes
def load(cls, filepath: str, gzip_format=False, max_len=0, verbose=False):
    filename = os.path.basename(filepath)
    if gzip_format:
        f = gzip.open(filepath, 'rb')
    else:
        f = open(filepath, 'rb')

    with f:
        d = DataSet()
        d.name, d.size, d.features_vector, d.labels_vector = pickle.load(f), pickle.load(f), pickle.load(f), pickle.load(f)

        check_interval = min(100000, math.ceil(d.size))
        features, labels = [], []
        for i in range(d.size):
            if 0 < max_len <= len(features):
                break

            feature, label = pickle.load(f), pickle.load(f)
            # print('load feature:', feature, 'label:', label)
            features.append(feature)
            labels.append(label)
            if verbose and i % check_interval == 0:
                log.info('%s %.1f%% loaded.' % (filename, i / d.size * 100))
        log.info('%s 100%% loaded.' % filename)

        d.features = np.asarray(features)
        d.labels = np.asarray(labels)
        log.info('%s features shape: %s' % (filename, d.features.shape))
        log.info('%s labels shape: %s' % (filename, d.labels.shape))
    return d
def build_FFNN(cls, n_features, n_classes, n_hidden1, learning_rate, watch=WatchUtil(), layers=4):
    log.info('\nbuild_FFNN(layers=%s)' % layers)
    if layers == 2:
        return cls.__build_FFNN_layers2(n_features, n_classes, n_hidden1, learning_rate, watch=watch)
    else:
        return cls.__build_FFNN_layers4(n_features, n_classes, n_hidden1, learning_rate, watch=watch)
def save(self, filepath: str, gzip_format=False, verbose=False): filename = os.path.basename(filepath) if gzip_format: f = gzip.open(filepath, 'wb') else: f = open(filepath, 'wb') with f: for o in [ self.name, self.size, self.features_vector, self.labels_vector, self.labels ]: pickle.dump(o, f) check_interval = min(100000, math.ceil(self.size)) for i, o in enumerate(self.features): pickle.dump(o, f) if verbose and i % check_interval == 0: log.info('%s %.1f%% saved.' % (filename, i / self.size * 100)) log.info('%s 100%% saved.' % filename)
def save(self, filepath: str, gzip_format=False, verbose=False): filename = os.path.basename(filepath) if gzip_format: f = gzip.open(filepath, 'wb') else: f = open(filepath, 'wb') with f: for o in [ self.name, self.size, self.features_vector, self.labels_vector ]: pickle.dump(o, f) check_interval = min(100000, math.ceil(self.size)) for i, (feature, label) in enumerate(zip(self.features, self.labels)): # print('save feature:', feature, 'label:', label) pickle.dump(feature, f) pickle.dump(label, f) if verbose and i % check_interval == 0: log.info('%s %.1f%% saved.' % (filename, i / self.size * 100)) log.info('%s 100%% saved.' % filename) log.info('shape: %s' % self.features.shape)
def collect_characters(sentences_file: str, characters_file: str, max_test: int = 0):
    """
    Read a sentences file and extract the unique characters (syllables).
    They are used later to build one-hot vectors from the corpus.
    :param sentences_file: *.sentences file path
    :param characters_file: *.characters file path
    :param max_test: 0=run all
    :return:
    """
    total = FileUtil.count_lines(sentences_file, gzip_format=True)
    log.info('total: %s' % NumUtil.comma_str(total))
    char_set = set()
    with gzip.open(sentences_file, 'rt') as f:
        for i, sentence in enumerate(f):
            i += 1
            if i % 10000 == 0:
                log.info('%s %.1f%% written.' % (os.path.basename(characters_file), i / total * 100))

            _char_set = set([c for c in sentence])
            char_set.update(_char_set)

            if 0 < max_test <= i:
                break

    char_list = list(char_set)
    char_list.sort()

    if max_test == 0:  # 0=full
        with open(characters_file, 'w') as f:
            for c in char_list:
                f.write(c)
                f.write('\n')
            log.info('written to %s OK.' % characters_file)
def load(cls, filepath: str, gzip_format=False, verbose=False): filename = os.path.basename(filepath) if gzip_format: f = gzip.open(filepath, 'rb') else: f = open(filepath, 'rb') with f: d = DataSet() d.name, d.size, d.features_vector, d.labels_vector, d.labels = \ pickle.load(f), pickle.load(f), pickle.load(f), pickle.load(f), pickle.load(f) check_interval = min(100000, math.ceil(d.size)) li = [] for i in range(d.size): li.append(pickle.load(f)) if verbose and i % check_interval == 0: log.info('%s %.1f%% loaded.' % (filename, i / d.size * 100)) log.info('%s 100%% loaded.' % filename) d.features = np.asarray(li) return d
def to_one_hot_vector(self, features_batch: np.ndarray, labels_batch: np.ndarray, verbose=False):
    _features, _labels = [], []
    check_interval = min(1000, math.ceil(features_batch.shape[0]))
    for i, (chars, has_space) in enumerate(zip(features_batch, labels_batch)):
        chars_v = self.features_vector.to_vectors(chars)
        feature = np.concatenate(chars_v)  # concatenated feature
        label = self.labels_vector.to_vector(has_space)
        _features.append(feature)
        _labels.append(label)
        if verbose and i % check_interval == 0:
            log.info('[%s] to_one_hot_vector %s -> %s, %s (len=%s) %s (len=%s)' % (i, chars, label, feature, len(feature), label, len(label)))
    return np.asarray(_features, dtype=np.int32), np.asarray(_labels, dtype=np.int32)
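# A minimal numpy sketch (illustration only; the real features_vector/labels_vector classes
# are project-specific) of what to_one_hot_vector produces: each character in a window
# becomes a one-hot vector over the character vocabulary, and the window's vectors are
# concatenated into a single flat feature row. The toy vocabulary below is made up.
import numpy as np

vocab = {c: i for i, c in enumerate(' 가나다라')}  # toy character vocabulary


def window_to_feature(chars):
    feature = np.zeros(len(chars) * len(vocab), dtype=np.int32)
    for pos, c in enumerate(chars):
        feature[pos * len(vocab) + vocab[c]] = 1  # one-hot slot for this character position
    return feature


print(window_to_feature('가 나'))  # length = window_size * len(vocab)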
def learning(cls, total_epoch, n_train, n_valid, n_test, batch_size, window_size, noise_rate, model_file, features_vector, labels_vector, n_hidden1,
             learning_rate, dropout_keep_rate, early_stop_cost=0.001):
    n_features = len(features_vector) * window_size  # number of features = 17,382 * 10

    log.info('load characters list...')
    log.info('load characters list OK. len: %s' % NumUtil.comma_str(len(features_vector)))
    watch = WatchUtil()

    train_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'spelling_error_correction',
                              'ko.wikipedia.org.dataset.sentences=%s.window_size=%d.train.gz' % (n_train, window_size))
    valid_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'spelling_error_correction',
                              'ko.wikipedia.org.dataset.sentences=%s.window_size=%d.valid.gz' % (n_valid, window_size))
    test_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'spelling_error_correction',
                             'ko.wikipedia.org.dataset.sentences=%s.window_size=%d.test.gz' % (n_test, window_size))

    log.info('train_file: %s' % train_file)
    log.info('valid_file: %s' % valid_file)
    log.info('test_file: %s' % test_file)

    if not os.path.exists(train_file) or not os.path.exists(valid_file) or not os.path.exists(test_file):
        dataset_dir = os.path.dirname(train_file)
        if not os.path.exists(dataset_dir):
            os.makedirs(dataset_dir)

        watch.start('create dataset')  # FIXME: out of memory (1M sentences)
        log.info('create dataset...')

        data_files = (('train', KO_WIKIPEDIA_ORG_TRAIN_SENTENCES_FILE, n_train, train_file, False),
                      ('valid', KO_WIKIPEDIA_ORG_VALID_SENTENCES_FILE, n_valid, valid_file, False),
                      ('test', KO_WIKIPEDIA_ORG_TEST_SENTENCES_FILE, n_test, test_file, False))

        for (name, data_file, total, dataset_file, to_one_hot_vector) in data_files:
            check_interval = 10000
            log.info('check_interval: %s' % check_interval)
            log.info('%s %s total: %s' % (name, os.path.basename(data_file), NumUtil.comma_str(total)))
            log.info('noise_rate: %s' % noise_rate)

            features, labels = [], []
            with gzip.open(data_file, 'rt') as f:
                for i, line in enumerate(f, 1):
                    if total < i:
                        break

                    if i % check_interval == 0:
                        time.sleep(0.01)  # prevent cpu overload
                        percent = i / total * 100
                        log.info('create dataset... %.1f%% read. data len: %s. %s' % (percent, NumUtil.comma_str(len(features)), data_file))

                    sentence = line.strip()
                    for start in range(0, len(sentence) - window_size + 1):  # generate noise (blanks) at the character level
                        chars = sentence[start: start + window_size]
                        for idx in range(len(chars)):
                            noised_chars = StringUtil.replace_with_index(chars, ' ', idx)
                            features.append(noised_chars)
                            labels.append(chars)
                            log.debug('create dataset... %s "%s" -> "%s"' % (name, noised_chars, chars))

                    # log.info('noise_sampling: %s' % noise_sampling)
                    # for nth_sample in range(noise_sampling):  # generate noise at the jamo (initial/medial/final) level
                    #     for start in range(0, len(sentence) - window_size + 1):
                    #         chars = sentence[start: start + window_size]
                    #         noised_chars = SpellingErrorCorrection.encode_noise(chars, noise_rate=noise_rate, noise_with_blank=True)
                    #         if chars == noised_chars:
                    #             continue
                    #         if i % check_interval == 0 and nth_sample == 0:
                    #             log.info('create dataset... %s "%s" -> "%s"' % (name, noised_chars, chars))
                    #         features.append(noised_chars)
                    #         labels.append(chars)

            # print('dataset features:', features)
            # print('dataset labels:', labels)
            dataset = DataSet(features=features, labels=labels, features_vector=features_vector, labels_vector=labels_vector, name=name)
            log.info('dataset save... %s' % dataset_file)
            dataset.save(dataset_file, gzip_format=True, verbose=True)
            log.info('dataset save OK. %s' % dataset_file)
            log.info('dataset: %s' % dataset)

        log.info('create dataset OK.')
        log.info('')
        watch.stop('create dataset')

    watch.start('dataset load')
    log.info('dataset load...')
    train = DataSet.load(train_file, gzip_format=True, verbose=True)

    if n_train >= int('100,000'.replace(',', '')):
        valid = DataSet.load(valid_file, gzip_format=True, verbose=True)
    else:
        valid = DataSet.load(train_file, gzip_format=True, verbose=True)
    log.info('valid.convert_to_one_hot_vector()...')
    valid = valid.convert_to_one_hot_vector(verbose=True)
    log.info('valid.convert_to_one_hot_vector() OK.')

    log.info('train dataset: %s' % train)
    log.info('valid dataset: %s' % valid)
    log.info('dataset load OK.')
    log.info('')
    watch.stop('dataset load')

    X, Y, dropout_keep_prob, train_step, cost, y_hat, accuracy = SpellingErrorCorrection.build_DAE(n_features, window_size, noise_rate, n_hidden1,
                                                                                                   learning_rate, watch)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        check_interval = max(1, min(1000, n_train // 10))
        nth_train, nth_input, total_input = 0, 0, total_epoch * train.size

        log.info('')
        log.info('learn...')
        log.info('total_epoch: %s' % total_epoch)
        log.info('train.size (total features): %s' % NumUtil.comma_str(train.size))
        log.info('check_interval: %s' % check_interval)
        log.info('total_epoch: %s' % total_epoch)
        log.info('batch_size: %s' % batch_size)
        log.info('total_input: %s (total_epoch * train.size)' % total_input)
        log.info('')
        watch.start('learn')
        valid_cost = sys.float_info.max
        for epoch in range(1, total_epoch + 1):
            if valid_cost < early_stop_cost:
                log.info('valid_cost: %s, early_stop_cost: %s, early stopped.' % (valid_cost, early_stop_cost))
                break
            for step, (features_batch, labels_batch) in enumerate(train.next_batch(batch_size=batch_size, to_one_hot_vector=True), 1):
                if valid_cost < early_stop_cost:
                    break

                nth_train += 1
                nth_input += features_batch.shape[0]
                sess.run(train_step, feed_dict={X: features_batch, Y: labels_batch, dropout_keep_prob: dropout_keep_rate})

                # if nth_train % check_interval == 1:
                percent = nth_input / total_input * 100
                valid_cost = sess.run(cost, feed_dict={X: valid.features, Y: valid.labels, dropout_keep_prob: 1.0})
                log.info('[epoch=%s][%.1f%%] %s cost: %.8f' % (epoch, percent, valid.name, valid_cost))
        watch.stop('learn')
        log.info('learn OK.')
        log.info('')

        log.info('model save... %s' % model_file)
        watch.start('model save...')
        model_dir = os.path.dirname(model_file)
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
        saver = tf.train.Saver()
        saver.save(sess, model_file)
        watch.stop('model save...')
        log.info('model save OK. %s' % model_file)

    log.info('')
    log.info('total_epoch: %s' % total_epoch)
    log.info('batch_size: %s' % batch_size)
    log.info('total_input: %s (total_epoch * train.size)' % total_input)
    log.info('')
    log.info(watch.summary())
    log.info('')
def build_DAE(cls, n_features, window_size, noise_rate, n_hidden1, learning_rate, watch=WatchUtil()): if len(cls.graph) == 0: log.info('') log.info('create tensorflow graph...') watch.start('create tensorflow graph') features_vector_size = n_features // window_size log.info('n_features: %s' % n_features) log.info('window_size: %s' % window_size) log.info('features_vector_size: %s' % features_vector_size) log.info('noise_rate: %.1f' % noise_rate) log.info('n_hidden1: %s' % n_hidden1) tf.set_random_seed(777) # for reproducibility X = tf.placeholder(tf.float32, [None, n_features], name='X') # shape=(batch_size, window_size * feature_vector.size) Y = tf.placeholder(tf.float32, [None, n_features], name='Y') # shape=(batch_size, window_size * feature_vector.size) dropout_keep_prob = tf.placeholder(tf.float32) # layers = 3 # n_hidden2 = n_hidden1 # W1 = tf.Variable(tf.random_normal([n_features, n_hidden1]), name='W1') # b1 = tf.Variable(tf.random_normal([n_hidden1]), name='b1') # layer1 = tf.nn.sigmoid(tf.matmul(X, W1) + b1, name='layer1') # layer1_dropout = tf.nn.dropout(layer1, dropout_keep_prob, name='layer1_dropout') # # W2 = tf.Variable(tf.random_normal([n_hidden1, n_hidden2]), name='W2') # b2 = tf.Variable(tf.random_normal([n_hidden2]), name='b2') # layer2 = tf.nn.sigmoid(tf.matmul(layer1_dropout, W2) + b2, name='layer2') # layer2_dropout = tf.nn.dropout(layer2, dropout_keep_prob, name='layer2_dropout') # # W3 = tf.Variable(tf.random_normal([n_hidden1, n_features]), name='W3') # b3 = tf.Variable(tf.random_normal([n_features]), name='b3') # y_hat = tf.add(tf.matmul(layer2_dropout, W3), b3, name='y_hat') # layers = 2 W1 = tf.Variable(tf.random_normal([n_features, n_hidden1]), name='W1') b1 = tf.Variable(tf.random_normal([n_hidden1]), name='b1') layer1 = tf.nn.sigmoid(tf.matmul(X, W1) + b1, name='layer1') layer1_dropout = tf.nn.dropout(layer1, dropout_keep_prob, name='layer1_dropout') W2 = tf.Variable(tf.random_normal([n_hidden1, n_features]), name='W2') b2 = tf.Variable(tf.random_normal([n_features]), name='b2') y_hat = tf.add(tf.matmul(layer1_dropout, W2), b2, name='y_hat') # shape=(batch_size, window_size * feature_vector.size) labels_hat = tf.reshape(y_hat, shape=(-1, window_size, features_vector_size)) # shape=(batch_size, window_size, feature_vector.size) labels = tf.reshape(Y, shape=(-1, window_size, features_vector_size)) # shape=(batch_size, window_size, feature_vector.size) cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=labels_hat, labels=labels), name='cost') train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost) accuracy = tf.reduce_mean(tf.cast(tf.abs(tf.nn.softmax(y_hat) - Y) < 0.1, dtype=tf.float32), name='accuracy') # log.debug('X:', X) # log.debug('Y:', Y) # log.debug('y_hat:', y_hat) # log.debug('labels_hat:', labels_hat) # log.debug('labels:', labels) # log.debug('cost:', cost) # log.debug('accuracy:', accuracy) watch.stop('create tensorflow graph') log.info('create tensorflow graph OK.') log.info('') cls.graph = {'X': X, 'Y': Y, 'dropout_keep_prob': dropout_keep_prob, 'train_step': train_step, 'cost': cost, 'y_hat': y_hat, 'accuracy': accuracy, } return cls.graph['X'], cls.graph['Y'], cls.graph['dropout_keep_prob'], \ cls.graph['train_step'], cls.graph['cost'], cls.graph['y_hat'], cls.graph['accuracy']
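# A minimal numpy sketch (not part of the project) illustrating how build_DAE's cost treats
# the flat output: logits of shape (batch, window_size * V) are reshaped to
# (batch, window_size, V) so that softmax cross-entropy is applied per character slot and
# then averaged -- the same quantity tf.nn.softmax_cross_entropy_with_logits followed by
# tf.reduce_mean computes above. The sizes below are made up for illustration.
import numpy as np

batch, window_size, V = 2, 3, 4                        # V = size of the character vocabulary (assumed)
logits = np.random.randn(batch, window_size * V)       # like y_hat: one flat prediction per window
targets = np.zeros((batch, window_size * V))           # like Y: concatenated one-hot characters
for b in range(batch):
    for w in range(window_size):
        targets[b, w * V + np.random.randint(V)] = 1.0

logits3 = logits.reshape(batch, window_size, V)        # corresponds to labels_hat
targets3 = targets.reshape(batch, window_size, V)      # corresponds to labels

exp = np.exp(logits3 - logits3.max(axis=-1, keepdims=True))
softmax = exp / exp.sum(axis=-1, keepdims=True)
cost = -(targets3 * np.log(softmax)).sum(axis=-1).mean()  # mean cross-entropy over (batch, window)
print('cost: %.4f' % cost)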
if len(sentence) != len(sentence_hat): return sim, correct, total for a, b in zip(sentence, sentence_hat): if a == b: correct += 1 sim = correct / total return sim, correct, total if __name__ == '__main__': train_sentences_file = KO_WIKIPEDIA_ORG_TRAIN_SENTENCES_FILE valid_sentences_file = KO_WIKIPEDIA_ORG_VALID_SENTENCES_FILE test_sentences_file = KO_WIKIPEDIA_ORG_TEST_SENTENCES_FILE log.info('train_sentences_file: %s' % train_sentences_file) log.info('valid_sentences_file: %s' % valid_sentences_file) log.info('test_sentences_file: %s' % test_sentences_file) log.info('') characters_file = KO_WIKIPEDIA_ORG_CHARACTERS_FILE log.info('characters_file: %s' % characters_file) try: if len(sys.argv) == 4: n_train = int(sys.argv[1]) window_size = int(sys.argv[2]) noise_rate = float(sys.argv[3]) else: n_train, noise_rate, window_size = None, None, None if n_train is None or n_train == 0: # default
def learning(cls, sentences_file, batch_size, left_gram, right_gram, model_file, features_vector, labels_vector, n_hidden1=100, max_sentences=0,
             learning_rate=0.01, layers=2):
    ngram = left_gram + right_gram
    n_features = len(features_vector) * ngram  # number of features = 17,380 * 4
    n_classes = len(labels_vector) if len(labels_vector) >= 3 else 1  # number of classes = 2 but len=1

    log.info('load characters list...')
    log.info('load characters list OK. len: %s\n' % NumUtil.comma_str(len(features_vector)))
    watch = WatchUtil()

    train_file = os.path.join(KO_WIKIPEDIA_ORG_DATA_DIR, 'datasets',
                              'ko.wikipedia.org.dataset.sentences=%d.left=%d.right=%d.train.gz' % (max_sentences, left_gram, right_gram))
    validation_file = train_file.replace('.train.', '.validation.')
    test_file = train_file.replace('.train.', '.test.')
    if not os.path.exists(train_file) or not os.path.exists(validation_file) or not os.path.exists(test_file):
        watch.start('create dataset')
        log.info('create dataset...')
        features, labels = [], []
        check_interval = min(10000, math.ceil(max_sentences))
        log.info('total: %s' % NumUtil.comma_str(max_sentences))

        with gzip.open(sentences_file, 'rt') as f:
            for i, line in enumerate(f, 1):
                if max_sentences < i:
                    break

                if i % check_interval == 0:
                    log.info('create dataset... %.1f%% read. data len: %s' % (i / max_sentences * 100, NumUtil.comma_str(len(features))))

                _f, _l = WordSpacing.sentence2features_labels(line.strip(), left_gram=left_gram, right_gram=right_gram)
                features.extend(_f)
                labels.extend(_l)

        dataset = DataSet(features=features, labels=labels, features_vector=features_vector, labels_vector=labels_vector, name='all')
        log.info('dataset: %s' % dataset)
        log.info('create dataset OK.\n')
        watch.stop('create dataset')

        watch.start('dataset save')
        log.info('split to train, test, validation...')
        datasets = DataSets.to_datasets(dataset, test_rate=0.1, valid_rate=0.1, test_max=10000, valid_max=1000, shuffle=True)
        train, test, validation = datasets.train, datasets.test, datasets.validation
        log.info(train)
        log.info(test)
        log.info(validation)
        # log.info('%s %s' % (test.features[0], test.labels[0]))
        log.info('split to train, test, validation OK.\n')

        log.info('dataset save... %s' % train_file)
        train.save(train_file, verbose=True)  # save as text
        log.info('dataset save OK.\n')

        log.info('dataset save... %s' % validation_file)
        validation = validation.convert_to_one_hot_vector(verbose=True)  # save as vector
        validation.save(validation_file, verbose=True)
        log.info('dataset save OK.\n')

        log.info('dataset save... %s' % test_file)
        test = test.convert_to_one_hot_vector(verbose=True)
        test.save(test_file, verbose=True)  # save as vector
        log.info('dataset save OK.\n')
        watch.stop('dataset save')
    else:
        watch.start('dataset load')
        log.info('dataset load...')
        train = DataSet.load(train_file, verbose=True)
        validation = DataSet.load(validation_file, verbose=True)
        test = DataSet.load(test_file, verbose=True)
        log.info(train)
        log.info(validation)
        log.info(test)
        log.info('dataset load OK.\n')
        watch.stop('dataset load')

    log.info('check samples...')
    for i, (features_batch, labels_batch) in enumerate(train.next_batch(batch_size=5, to_one_hot_vector=True), 1):
        if i > 2:
            break
        for a, b in zip(features_batch, labels_batch):
            feature, label = a, b
            _feature = feature.reshape((ngram, len(features_vector)))
            chars = ''.join(features_vector.to_values(_feature))
            has_space = np.argmax(label)
            log.info('[%s] %s -> %s, %s (len=%s) %s (len=%s)' % (i, chars, has_space, feature, len(feature), label, len(label)))
    log.info('check samples OK.\n')

    graph = WordSpacing.build_FFNN(n_features, n_classes, n_hidden1, learning_rate, watch, layers=layers)
    train_step, X, Y, cost, hypothesis, predicted, accuracy = graph['train_step'], graph['X'], graph['Y'], graph['cost'], graph['hypothesis'], \
        graph['predicted'], graph['accuracy']

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        n_input = 0
        log.info('total: %s' % NumUtil.comma_str(train.size))
        log.info('learn...')
        watch.start('learn')
        for step, (features_batch, labels_batch) in enumerate(train.next_batch(batch_size=batch_size), 1):
            n_input += batch_size
            sess.run(train_step, feed_dict={X: features_batch, Y: labels_batch})
            log.info('[%s][%.1f%%] validation cost: %.4f' % (NumUtil.comma_str(n_input), n_input / train.size * 100,
                                                             sess.run(cost, feed_dict={X: validation.features, Y: validation.labels})))
        watch.stop('learn')
        log.info('learn OK.\n')

        log.info('evaluate...')
        watch.start('evaluate...')
        _hypothesis, _correct, _accuracy = sess.run([hypothesis, predicted, accuracy], feed_dict={X: test.features, Y: test.labels})  # Accuracy report
        watch.stop('evaluate...')
        log.info('evaluate OK.')

        log.info('model save... %s' % model_file)
        watch.start('model save...')
        model_dir = os.path.dirname(model_file)
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
        saver = tf.train.Saver()
        saver.save(sess, model_file)
        watch.stop('model save...')
        log.info('model save OK. %s' % model_file)

    log.info('\n')
    log.info(watch.summary())
    # log.info('hypothesis: %s %s' % (_hypothesis.shape, _hypothesis))
    # log.info('correct: %s %s' % (_correct.shape, _correct))
    log.info('accuracy: %s %s' % (_accuracy.shape, _accuracy))
    log.info('\n')
def create_graph(model_name, scope_name, first_pipeline, second_pipeline, verbose=False): """ create or reuse graph :param model_name: :param first_pipeline: :param second_pipeline: :param scope_name: :param verbose: print graph nodes :return: tensorflow graph nodes """ with tf.variable_scope('common'): # for reusing graph use_first_pipeline = tf.placeholder(dtype=bool) learning_rate = tf.placeholder(dtype=tf.float32, name='learning_rate') W1 = tf.get_variable(dtype=tf.float32, shape=[input_len, output_len], initializer=tf.random_normal_initializer(), name='W1') b1 = tf.get_variable(dtype=tf.float32, initializer=tf.constant(0.0, shape=[output_len]), name='b1') x, y = tf.cond(use_first_pipeline, lambda: first_pipeline, lambda: second_pipeline) y_hat = tf.add(tf.matmul(x, W1), b1, name='y_hat') cost = tf.reduce_mean(tf.square(y_hat - y), name='cost') train_step = tf.train.AdamOptimizer( learning_rate=learning_rate).minimize(cost, name='train_step') with tf.variable_scope(scope_name, reuse=None): _W1 = tf.summary.histogram(values=W1, name='_W1') _b1 = tf.summary.histogram(values=b1, name='_b1') _cost = tf.summary.scalar(tensor=cost, name='_cost') summary = tf.summary.merge([_W1, _b1, _cost], name='summary') # merge_all() if verbose: log.info('') log.info(x) log.info(W1) log.info(b1) log.info('') log.info(y) log.info(y_hat) log.info(cost) return x, y, learning_rate, use_first_pipeline, W1, b1, y_hat, cost, train_step, summary
if __name__ == '__main__': os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # ignore tensorflow warnings tf.logging.set_verbosity(tf.logging.ERROR) # ignore tensorflow info func = multiply n_features = 2 # x1, x2 n_classes = 1 # y digits = list(range(-99, 100, 1)) n_train, n_test = 4000, 100 # 1% of 200 * 200 x_data = np.random.choice(digits, (n_train + n_test, n_features), replace=True) y_data = func(x_data) x_train, x_test = x_data[:n_train], x_data[n_train:] y_train, y_test = y_data[:n_train], y_data[n_train:] log.info('') log.info('func: %s' % func.__name__) log.info('digits: %s ~ %s ' % (min(digits), max(digits))) log.info('x_train: %s' % str(x_train.shape)) log.info(x_data[:5]) log.info('y_train: %s' % str(y_train.shape)) log.info(y_data[:5]) log.info('x_test: %s' % str(x_test.shape)) log.info('y_test %s' % str(y_test.shape)) valid_check_interval = 0.5 bias_value = 0.0 early_stop_cost = 0.1 # stop learning # default values optimizer = tf.train.AdamOptimizer
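# `multiply` is defined elsewhere in this script; a plausible minimal version (an assumption,
# shown only so the `func = multiply` line above is self-explanatory) would take the (N, 2)
# input array and return the element-wise product of its two columns as an (N, 1) target.
import numpy as np


def multiply(x_data: np.ndarray) -> np.ndarray:
    return np.prod(x_data, axis=1, keepdims=True)  # y = x1 * x2, shape (N, 1)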
import os

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

from bage_utils.base_util import is_server
from nlp4kor.config import MNIST_DATA_DIR, MNIST_DAE_MODEL_DIR, log

if __name__ == '__main__':
    mnist_data = os.path.join(MNIST_DATA_DIR, MNIST_DAE_MODEL_DIR)  # input
    device2use = '/gpu:0' if is_server() else '/cpu:0'

    model_file = os.path.join(MNIST_DAE_MODEL_DIR, 'dae_mnist_model/model')  # .%s' % max_sentences
    log.info('model_file: %s' % model_file)

    model_dir = os.path.dirname(model_file)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    image_shape = (28, 28)
    mnist = input_data.read_data_sets(mnist_data, one_hot=True)
    assert (mnist.train.images.shape[1] == mnist.test.images.shape[1])

    n_input_dim = mnist.train.images.shape[1]  # MNIST data input (img shape: 28*28)
    n_output_dim = n_input_dim  # MNIST data input (img shape: 28*28)
    n_hidden_1 = 256  # 1st layer num features
    n_hidden_2 = 256  # 2nd layer num features

    log.info('n_input_dim: %s' % n_input_dim)
        info_f.write('total docs: %s\n' % NumUtil.comma_str(total_docs))
        info_f.write('total docs: %s (has hangul sentence)\n' % NumUtil.comma_str(n_docs))
        info_f.write('total sentences: %s (has hangul sentence)\n' % NumUtil.comma_str(n_total))
        info_f.write('train: %s\n' % NumUtil.comma_str(n_train))
        info_f.write('valid: %s\n' % NumUtil.comma_str(n_valid))
        info_f.write('test: %s\n' % NumUtil.comma_str(n_test))
        info_f.write('total characters: %s\n' % NumUtil.comma_str(len(char_list)))


if __name__ == '__main__':
    info_file = KO_WIKIPEDIA_ORG_INFO_FILE
    urls_file = KO_WIKIPEDIA_ORG_URLS_FILE
    sentences_file = KO_WIKIPEDIA_ORG_SENTENCES_FILE
    characters_file = KO_WIKIPEDIA_ORG_CHARACTERS_FILE
    log.info('info_file: %s' % info_file)
    log.info('urls_file: %s' % urls_file)
    log.info('sentences_file: %s' % sentences_file)
    log.info('characters_file: %s' % characters_file)

    if not os.path.exists(characters_file) or not os.path.exists(sentences_file) or not os.path.exists(info_file) or not os.path.exists(urls_file):
        try:
            log.info('create sentences file...')
            TextPreprocess.dump_corpus(MONGO_URL, db_name='parsed', collection_name='ko.wikipedia.org', sentences_file=sentences_file,
                                       characters_file=characters_file, info_file=info_file, urls_file=urls_file,
                                       train_sentences_file=KO_WIKIPEDIA_ORG_TRAIN_SENTENCES_FILE,
                                       valid_sentences_file=KO_WIKIPEDIA_ORG_VALID_SENTENCES_FILE,
                                       test_sentences_file=KO_WIKIPEDIA_ORG_TEST_SENTENCES_FILE,
                                       mongo_query={})  # mongodb -> text file(corpus)
            log.info('create sentences file OK')
        total_spaces = labels1.count(1)  # number of spaces in the gold labels
        correct = total_spaces - incorrect  # spaces predicted at the same positions as in the gold labels
        if total_spaces == 0:
            sim = 1
        else:
            sim = correct / total_spaces
        return sim, correct, total_spaces


if __name__ == '__main__':
    train_sentences_file = KO_WIKIPEDIA_ORG_TRAIN_SENTENCES_FILE
    valid_sentences_file = KO_WIKIPEDIA_ORG_VALID_SENTENCES_FILE
    test_sentences_file = KO_WIKIPEDIA_ORG_TEST_SENTENCES_FILE
    log.info('train_sentences_file: %s' % train_sentences_file)
    log.info('valid_sentences_file: %s' % valid_sentences_file)
    log.info('test_sentences_file: %s' % test_sentences_file)

    characters_file = KO_WIKIPEDIA_ORG_CHARACTERS_FILE
    log.info('characters_file: %s' % characters_file)

    try:
        if len(sys.argv) == 4:
            n_train = int(sys.argv[1])
            left_gram = int(sys.argv[2])
            right_gram = int(sys.argv[3])
        else:
            n_train, left_gram, right_gram = 1000000, 3, 3
            # n_train, left_gram, right_gram = int('1,000,000'.replace(',', '')), 2, 2

        if left_gram is None:
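# A self-contained sketch of the space-similarity metric used above (an illustration, not the
# project's exact API): each label is 1 where the gold sentence has a space, and
# sim = (# of gold spaces also predicted) / (# of gold spaces).
def space_sim(labels_gold, labels_pred):
    incorrect = sum(1 for g, p in zip(labels_gold, labels_pred) if g == 1 and p != 1)
    total_spaces = labels_gold.count(1)
    correct = total_spaces - incorrect
    sim = 1 if total_spaces == 0 else correct / total_spaces
    return sim, correct, total_spaces


print(space_sim([0, 1, 0, 1, 0], [0, 1, 0, 0, 0]))  # -> (0.5, 1, 2)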
def create_graph(scope_name, input_len=2, output_len=1, verbose=False): """ create or reuse graph :param output_len: x1, x2 :param input_len: y :param scope_name: :param verbose: print graph nodes :return: tensorflow graph nodes """ with tf.variable_scope('common') as variable_scope: # for reusing graph learning_rate = tf.placeholder(dtype=tf.float32, name='learning_rate') x = tf.placeholder(dtype=tf.float32, shape=[None, input_len], name='x') y = tf.placeholder(dtype=tf.float32, shape=[None, output_len], name='y') W1 = tf.get_variable(dtype=tf.float32, shape=[input_len, output_len], initializer=tf.random_normal_initializer(), name='W1') b1 = tf.get_variable(dtype=tf.float32, initializer=tf.constant(0.0, shape=[output_len]), name='b1') y_hat = tf.add(tf.matmul(x, W1), b1, name='y_hat') cost = tf.reduce_mean(tf.square(y_hat - y), name='cost') train_step = tf.train.AdamOptimizer(learning_rate=learning_rate, name='optimizer').minimize( cost, name='train_step') with tf.variable_scope(scope_name, reuse=None) as scope: _W1 = tf.summary.histogram(values=W1, name='_W1') _b1 = tf.summary.histogram(values=b1, name='_b1') _cost = tf.summary.scalar(tensor=cost, name='_cost') summary = tf.summary.merge([_W1, _b1, _cost]) if verbose: log.info('') log.info(x) log.info(W1) log.info(b1) log.info('') log.info(y) log.info(y_hat) log.info(cost) return x, y, learning_rate, W1, b1, y_hat, cost, train_step, summary
exit() config = tf.ConfigProto() config.gpu_options.allow_growth = True filenames = [data_file] features_batch, labels_batch = input_pipeline(filenames, batch_size=batch_size, shuffle=shuffle, tokens=2) with tf.Session(config=config) as sess: sess.run(tf.global_variables_initializer()) coordinator = tf.train.Coordinator() threads = tf.train.start_queue_runners(sess=sess, coord=coordinator) log.info('coordinator: %s' % coordinator) log.info('threads: %s, %s' % (len(threads), threads)) try: for nth_batch in range(5): if coordinator.should_stop(): break _features_batch, _labels_batch = sess.run( [features_batch, labels_batch]) log.info('') log.info('nth_batch: %s' % nth_batch) for _f, _l in zip(_features_batch, _labels_batch): log.info('%s %s' % (_f.decode('utf8'), _l.decode('utf8'))) # decode for print except: log.info(traceback.format_exc())
def dump_corpus(mongo_url, db_name, collection_name, sentences_file, characters_file, info_file, urls_file,
                train_sentences_file, valid_sentences_file, test_sentences_file, mongo_query=None, limit=None):
    """
    Read documents from MongoDB and store them sentence by sentence.
    (Sentences that contain only one word, or no Hangul at all, are not extracted.)
    :param characters_file:
    :param urls_file:
    :param info_file:
    :param mongo_url: mongodb://~~~
    :param db_name: database name of mongodb
    :param collection_name: collection name of mongodb
    :param sentences_file: *.sentence file
    :param train_sentences_file:
    :param valid_sentences_file:
    :param test_sentences_file:
    :param mongo_query: default={}
    :param limit:
    :return:
    """
    if mongo_query is None:
        mongo_query = {}

    corpus_mongo = MongodbUtil(mongo_url, db_name=db_name, collection_name=collection_name)
    total_docs = corpus_mongo.count()
    log.info('%s total: %s' % (corpus_mongo, NumUtil.comma_str(total_docs)))

    output_dir = os.path.dirname(sentences_file)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    with gzip.open(sentences_file, 'wt') as out_f, \
            gzip.open(train_sentences_file, 'wt') as train_f, \
            gzip.open(valid_sentences_file, 'wt') as valid_f, \
            gzip.open(test_sentences_file, 'wt') as test_f, \
            open(info_file, 'wt') as info_f, \
            open(urls_file, 'wt') as urls_f:
        char_set = set()
        n_docs = n_total = n_train = n_valid = n_test = 0
        if limit:
            cursor = corpus_mongo.find(mongo_query, limit=limit)
        else:
            cursor = corpus_mongo.find(mongo_query)

        for i, row in enumerate(cursor, 1):
            if i % 1000 == 0:
                log.info('%s %.1f%% written.' % (os.path.basename(sentences_file), i / total_docs * 100))

            sentences = []
            for c in row['content']:
                sentences.extend(HangulUtil.text2sentences(c['sentences'], remove_only_one_word=True, has_hangul=True))
            # sentences = HangulUtil.text2sentences(row['content'], remove_only_one_word=True, has_hangul=True)
            log.debug('url: %s, len: %s' % (row['url'], len(sentences)))

            if len(sentences) == 0:
                # log.error(row['content'])
                continue

            urls_f.write(row['url'])
            urls_f.write('\n')
            n_docs += 1

            for s in sentences:
                _char_set = set([c for c in s])
                char_set.update(_char_set)

                n_total += 1
                out_f.write(s)
                out_f.write('\n')

            if len(sentences) >= 10:  # can split
                test_len = valid_len = len(sentences) // 10
                # log.info('train: %s, test: %s, valid: %s' % (len(sentences) - test_len - valid_len, test_len, valid_len))
                for s in sentences[:test_len]:
                    n_test += 1
                    test_f.write(s)
                    test_f.write('\n')
                for s in sentences[test_len:test_len + valid_len]:
                    n_valid += 1
                    valid_f.write(s)
                    valid_f.write('\n')
                for s in sentences[test_len + valid_len:]:
                    n_train += 1
                    train_f.write(s)
                    train_f.write('\n')
            else:  # can't split
                for s in sentences:
                    n_train += 1
                    train_f.write(s)
                    train_f.write('\n')

        char_list = list(char_set)
        char_list.sort()

        log.info('writing to %s...' % characters_file)
        with open(characters_file, 'w') as f:
            for c in char_list:
                f.write(c)
                f.write('\n')
        log.info('written to %s OK.' % characters_file)

        log.info('total docs: %s', NumUtil.comma_str(total_docs))
        log.info('total docs: %s (has hangul sentence)', NumUtil.comma_str(n_docs))
        log.info('total sentences: %s (has hangul sentence)', NumUtil.comma_str(n_total))
        log.info('train: %s', NumUtil.comma_str(n_train))
        log.info('valid: %s', NumUtil.comma_str(n_valid))
        log.info('test: %s', NumUtil.comma_str(n_test))
        log.info('total characters: %s', NumUtil.comma_str(len(char_list)))

        info_f.write('total docs: %s\n' % NumUtil.comma_str(total_docs))
        info_f.write('total docs: %s (has hangul sentence)\n' % NumUtil.comma_str(n_docs))
        info_f.write('total sentences: %s (has hangul sentence)\n' % NumUtil.comma_str(n_total))
        info_f.write('train: %s\n' % NumUtil.comma_str(n_train))
        info_f.write('valid: %s\n' % NumUtil.comma_str(n_valid))
        info_f.write('test: %s\n' % NumUtil.comma_str(n_test))
        info_f.write('total characters: %s\n' % NumUtil.comma_str(len(char_list)))
            if l == 1 and labels2[idx] != 1:
                incorrect += 1

        total_spaces = labels1.count(1)  # number of spaces in the gold labels
        correct = total_spaces - incorrect  # spaces predicted at the same positions as in the gold labels
        if total_spaces == 0:
            sim = 1
        else:
            sim = correct / total_spaces
        return sim, correct, total_spaces


if __name__ == '__main__':
    sentences_file = KO_WIKIPEDIA_ORG_SENTENCES_FILE
    log.info('sentences_file: %s' % sentences_file)

    characters_file = KO_WIKIPEDIA_ORG_CHARACTERS_FILE
    log.info('characters_file: %s' % characters_file)

    try:
        if len(sys.argv) == 4:
            max_sentences = int(sys.argv[1])
            left_gram = int(sys.argv[2])
            right_gram = int(sys.argv[3])
        else:
            max_sentences, left_gram, right_gram = None, None, None

        if max_sentences is None:
            max_sentences = int('1,000,000'.replace(',', ''))
            # max_sentences = int('1,000,000'.replace(',', '')) if is_my_pc() else int('1,000,000'.replace(',', ''))  # run 100 or 1M data (training: ~17 hours)
            # max_sentences = int('1,000,000'.replace(',', '')) if is_my_pc() else FileUtil.count_lines(sentences_file, gzip_format=True)  # run 100 or full data (training time: ~5 days)
def learning(cls, total_epoch, n_train, n_valid, n_test, batch_size, left_gram, right_gram, model_file, features_vector, labels_vector, n_hidden1=100,
             learning_rate=0.01, early_stop_cost=0.001):
    ngram = left_gram + right_gram
    n_features = len(features_vector) * ngram  # number of features = 17,380 * 4
    n_classes = len(labels_vector) if len(labels_vector) >= 3 else 1  # number of classes = 2 but len=1

    log.info('load characters list...')
    log.info('load characters list OK. len: %s\n' % NumUtil.comma_str(len(features_vector)))
    watch = WatchUtil()

    train_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'word_spacing',
                              'ko.wikipedia.org.dataset.sentences=%s.left=%d.right=%d.train.gz' % (n_train, left_gram, right_gram))
    valid_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'word_spacing',
                              'ko.wikipedia.org.dataset.sentences=%s.left=%d.right=%d.valid.gz' % (n_valid, left_gram, right_gram))
    test_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'word_spacing',
                             'ko.wikipedia.org.dataset.sentences=%s.left=%d.right=%d.test.gz' % (n_test, left_gram, right_gram))
    log.info('train_file: %s' % train_file)
    log.info('valid_file: %s' % valid_file)
    log.info('test_file: %s' % test_file)

    if not os.path.exists(train_file) or not os.path.exists(valid_file) or not os.path.exists(test_file):
        dataset_dir = os.path.dirname(train_file)
        if not os.path.exists(dataset_dir):
            os.makedirs(dataset_dir)

        watch.start('create dataset')
        log.info('create dataset...')

        data_files = (('train', KO_WIKIPEDIA_ORG_TRAIN_SENTENCES_FILE, n_train, train_file, False),
                      ('valid', KO_WIKIPEDIA_ORG_VALID_SENTENCES_FILE, n_valid, valid_file, False),
                      ('test', KO_WIKIPEDIA_ORG_TEST_SENTENCES_FILE, n_test, test_file, False))

        for name, data_file, total, dataset_file, to_one_hot_vector in data_files:
            check_interval = 10000
            log.info('check_interval: %s' % check_interval)
            log.info('%s %s total: %s' % (name, os.path.basename(data_file), NumUtil.comma_str(total)))

            features, labels = [], []
            with gzip.open(data_file, 'rt') as f:
                for i, line in enumerate(f, 1):
                    if total < i:
                        break

                    if i % check_interval == 0:
                        time.sleep(0.01)  # prevent cpu overload
                        percent = i / total * 100
                        log.info('create dataset... %.1f%% read. data len: %s. %s' % (percent, NumUtil.comma_str(len(features)), data_file))

                    _f, _l = WordSpacing.sentence2features_labels(line.strip(), left_gram=left_gram, right_gram=right_gram)
                    features.extend(_f)
                    labels.extend(_l)

            dataset = DataSet(features=features, labels=labels, features_vector=features_vector, labels_vector=labels_vector, name=name)
            log.info('dataset save... %s' % dataset_file)
            dataset.save(dataset_file, gzip_format=True, verbose=True)
            log.info('dataset save OK. %s' % dataset_file)
            log.info('dataset: %s' % dataset)

        log.info('create dataset OK.')
        log.info('')
        watch.stop('create dataset')

    watch.start('dataset load')
    log.info('dataset load...')
    train = DataSet.load(train_file, gzip_format=True, verbose=True)

    if n_train >= int('100,000'.replace(',', '')):
        valid = DataSet.load(valid_file, gzip_format=True, verbose=True)
    else:
        valid = DataSet.load(train_file, gzip_format=True, verbose=True)
    log.info('valid.convert_to_one_hot_vector()...')
    valid = valid.convert_to_one_hot_vector(verbose=True)
    log.info('valid.convert_to_one_hot_vector() OK.')

    log.info('train dataset: %s' % train)
    log.info('valid dataset: %s' % valid)
    log.info('dataset load OK.')
    log.info('')
    watch.stop('dataset load')

    graph = WordSpacing.build_FFNN(n_features, n_classes, n_hidden1, learning_rate, watch)
    train_step, X, Y, cost, predicted, accuracy = graph['train_step'], graph['X'], graph['Y'], graph['cost'], graph['predicted'], graph['accuracy']

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        check_interval = 10  # max(1, min(1000, n_train // 10))
        nth_train, nth_input, total_input = 0, 0, total_epoch * train.size

        log.info('learn...')
        log.info('total: %s' % NumUtil.comma_str(train.size))
        watch.start('learn')
        valid_cost = sys.float_info.max
        for epoch in range(1, total_epoch + 1):
            if valid_cost < early_stop_cost:
                break
            for step, (features_batch, labels_batch) in enumerate(train.next_batch(batch_size=batch_size), 1):
                if valid_cost < early_stop_cost:
                    log.info('valid_cost: %s, early_stop_cost: %s, early stopped.' % (valid_cost, early_stop_cost))
                    break

                nth_train += 1
                nth_input += features_batch.shape[0]
                sess.run(train_step, feed_dict={X: features_batch, Y: labels_batch})

                # if step % check_interval == 1:
                percent = nth_input / total_input * 100
                valid_cost = sess.run(cost, feed_dict={X: valid.features, Y: valid.labels})
                log.info('[epoch=%s][%.1f%%] %s cost: %.4f' % (epoch, percent, valid.name, valid_cost))
        watch.stop('learn')
        log.info('learn OK.\n')

        log.info('model save... %s' % model_file)
        watch.start('model save...')
        model_dir = os.path.dirname(model_file)
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
        saver = tf.train.Saver()
        saver.save(sess, model_file)
        watch.stop('model save...')
        log.info('model save OK. %s' % model_file)

    log.info('\n')
    log.info('batch_size: %s' % batch_size)
    log.info(watch.summary())
    log.info('\n')
output_len = 1 # y _learning_rate = 0.01 n_train, n_valid, n_test = 1000, 100, 10 if not os.path.exists(train_file): create_data4add(train_file, n_train, digit_max=99) if not os.path.exists(valid_file): create_data4add(valid_file, n_valid, digit_max=99) if not os.path.exists(test_file): create_data4add(test_file, n_test, digit_max=99) for training_mode in [True, False]: # training & testing for batch_size in [1, 10, 100]: tf.reset_default_graph( ) # Clears the default graph stack and resets the global default graph. log.info('') log.info( 'training_mode: %s, batch_size: %s, total_train_time: %s secs' % (training_mode, batch_size, total_train_time)) model_name = os.path.basename(__file__).replace('.py', '') model_file = os.path.join( MODELS_DIR, '%s.n_train_%s.batch_size_%s.total_train_time_%s/model' % (model_name, n_train, batch_size, total_train_time)) model_dir = os.path.dirname(model_file) log.info('model_name: %s' % model_name) log.info('model_file: %s' % model_file) scope_name = '%s.%s.batch_size_%s.total_train_time_%s' % ( model_name, DateUtil.current_yyyymmdd_hhmm(), batch_size,
def build_FFNN(cls, n_features, n_classes, n_hidden1, learning_rate, watch=WatchUtil()):  # TODO: 2 layers
    log.info('\nbuild_FFNN')
    if len(cls.graph_nodes) == 0:
        n_hidden3 = n_hidden2 = n_hidden1
        log.info('create tensorflow graph...')
        watch.start('create tensorflow graph')
        log.info('n_features: %s' % n_features)
        log.info('n_classes: %s' % n_classes)
        log.info('n_hidden1: %s' % n_hidden1)
        log.info('n_hidden2: %s' % n_hidden2)
        log.info('n_hidden3: %s' % n_hidden3)

        tf.set_random_seed(777)  # for reproducibility

        X = tf.placeholder(tf.float32, [None, n_features], name='X')  # two characters
        Y = tf.placeholder(tf.float32, [None, n_classes], name='Y')

        # W1 = tf.Variable(tf.truncated_normal([n_features, n_hidden1], mean=0.0, stddev=0.1), name='W1')
        # b1 = tf.Variable(tf.constant(0.1, shape=[n_hidden1]), name='b1')
        W1 = tf.Variable(tf.random_normal([n_features, n_hidden1]), name='W1')
        b1 = tf.Variable(tf.random_normal([n_hidden1]), name='b1')
        layer1 = tf.nn.relu(tf.matmul(X, W1) + b1, name='layer1')

        W2 = tf.Variable(tf.random_normal([n_hidden1, n_hidden2]), name='W2')
        b2 = tf.Variable(tf.random_normal([n_hidden2]), name='b2')
        layer2 = tf.nn.relu(tf.matmul(layer1, W2) + b2, name='layer2')

        W3 = tf.Variable(tf.random_normal([n_hidden2, n_hidden3]), name='W3')
        b3 = tf.Variable(tf.random_normal([n_hidden3]), name='b3')
        layer3 = tf.nn.relu(tf.matmul(layer2, W3) + b3, name='layer3')

        W4 = tf.Variable(tf.random_normal([n_hidden3, n_classes]), name='W4')
        b4 = tf.Variable(tf.random_normal([n_classes]), name='b4')
        y_hat = tf.add(tf.matmul(layer3, W4), b4, name='y_hat')

        # cost = -tf.reduce_mean(Y * tf.log(hypothesis) + (1 - Y) * tf.log(1 - hypothesis), name='cost')  # cost/loss function
        cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=y_hat, labels=Y), name='cost')
        train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)  # Very Very good!! sentences=10000 + layer=4, 10 min, accuracy 0.9294, cost: 0.1839

        predicted = tf.cast(y_hat > 0.5, dtype=tf.float32, name='predicted')  # 0 <= hypothesis <= 1
        accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted, Y), dtype=tf.float32), name='accuracy')

        watch.stop('create tensorflow graph')
        log.info('create tensorflow graph OK.\n')
        cls.graph_nodes = {'predicted': predicted, 'accuracy': accuracy, 'X': X, 'Y': Y, 'train_step': train_step, 'cost': cost}
    return cls.graph_nodes
def create_graph(scope_name, mode, input_file, input_len=2, output_len=1, batch_size=1, verbose=False, reuse=None, n_threads=2): """ create or reuse graph :param scope_name: variable scope name :param mode: 'train', 'valid', 'test' :param input_file: train or valid or test file path :param input_len: x1, x2 :param output_len: y :param batch_size: batch size > 0 :param verbose: print graph nodes :param reuse: reuse graph or not :param n_threads: number of example enqueue threands (2 is enough) :return: tensorflow graph nodes """ with tf.variable_scope('common', reuse=reuse): # for reusing graph W1 = tf.get_variable(dtype=tf.float32, shape=[input_len, output_len], initializer=tf.random_normal_initializer(), name='W1') b1 = tf.get_variable(dtype=tf.float32, initializer=tf.constant(0.0, shape=[output_len]), name='b1') learning_rate = tf.placeholder(dtype=tf.float32, name='learning_rate') with tf.variable_scope(mode, reuse=None): x, y = input_pipeline([input_file], batch_size=batch_size, delim='\t', splits=3, n_threads=n_threads) y_hat = tf.add(tf.matmul(x, W1), b1, name='y_hat') cost = tf.reduce_mean(tf.square(y_hat - y), name='cost') train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost, name='train_step') with tf.variable_scope(scope_name, reuse=None): _W1 = tf.summary.histogram(values=W1, name='_W1') _b1 = tf.summary.histogram(values=b1, name='_b1') _cost = tf.summary.scalar(tensor=cost, name='_cost') summary = tf.summary.merge([_W1, _b1, _cost], name='summary') # merge_all() if verbose: log.info('') log.info(x) log.info(W1) log.info(b1) log.info('') log.info(y) log.info(y_hat) log.info(cost) return x, y, learning_rate, W1, b1, y_hat, cost, train_step, summary