log.info('n_valid: %s' % NumUtil.comma_str(n_valid))
log.info('n_test: %s' % NumUtil.comma_str(n_test))
log.info('left_gram: %s, right_gram: %s' % (left_gram, right_gram))
log.info('ngram: %s' % ngram)

total_sentences = FileUtil.count_lines(KO_WIKIPEDIA_ORG_SENTENCES_FILE)
model_file = os.path.join(KO_WIKIPEDIA_ORG_WORD_SPACING_MODEL_DIR,
                          'word_spacing_model.sentences=%s.left_gram=%s.right_gram=%s/model' % (n_train, left_gram, right_gram))  # .%s' % max_sentences
log.info('model_file: %s' % model_file)

batch_size = 500  # mini batch size
log.info('batch_size: %s' % batch_size)
total_epoch = 100  # min(100, 1000000 // n_train)  # 1 ~ 100

features_vector = CharOneHotVector(DataFileUtil.read_list(characters_file))
labels_vector = CharOneHotVector([0, 1])  # no space=0, space=1
n_features = len(features_vector) * ngram  # number of features = 17,380 * 4
n_classes = len(labels_vector) if len(labels_vector) >= 3 else 1  # two classes, but a single output unit is used for the binary case
n_hidden1 = 100
learning_rate = 0.001  # min(0.1, 0.001 * total_epoch)  # 0.1 ~ 0.001
early_stop_cost = 0.0001

log.info('features_vector: %s' % features_vector)
log.info('labels_vector: %s' % labels_vector)
log.info('n_features: %s' % n_features)
log.info('n_classes: %s' % n_classes)
log.info('n_hidden1: %s' % n_hidden1)
log.info('learning_rate: %s' % learning_rate)
log.info('early_stop_cost: %s' % early_stop_cost)

log.info('sample testing...')
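# --- Illustrative sketch, not from the original source ---------------------------------
# The hyperparameters above (n_features, n_hidden1, n_classes, learning_rate,
# early_stop_cost) suggest a single-hidden-layer feed-forward network with one sigmoid
# output for the binary spacing label. The excerpt does not show the actual graph, so the
# TensorFlow 1.x code below is only an assumed shape; names such as x, y, hidden1, logits,
# cost and train_step belong to this sketch, not to the project's own code.
import tensorflow as tf

x = tf.placeholder(tf.float32, shape=[None, n_features])   # one-hot ngram window per character position
y = tf.placeholder(tf.float32, shape=[None, n_classes])    # 0 = no space, 1 = space
w1 = tf.Variable(tf.random_normal([n_features, n_hidden1]))
b1 = tf.Variable(tf.random_normal([n_hidden1]))
hidden1 = tf.nn.relu(tf.matmul(x, w1) + b1)
w2 = tf.Variable(tf.random_normal([n_hidden1, n_classes]))
b2 = tf.Variable(tf.random_normal([n_classes]))
logits = tf.matmul(hidden1, w2) + b2
cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits))
train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
predicted = tf.cast(tf.sigmoid(logits) > 0.5, tf.float32)  # thresholded spacing decision
# ----------------------------------------------------------------------------------------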
def from_file(characters_file):
    chars = DataFileUtil.read_list(characters_file)
    return CharVocab(chars)