Example #1
        log.info('n_valid: %s' % NumUtil.comma_str(n_valid))
        log.info('n_test: %s' % NumUtil.comma_str(n_test))
        log.info('left_gram: %s, right_gram: %s' % (left_gram, right_gram))
        log.info('ngram: %s' % ngram)

        total_sentences = FileUtil.count_lines(KO_WIKIPEDIA_ORG_SENTENCES_FILE)
        model_file = os.path.join(KO_WIKIPEDIA_ORG_WORD_SPACING_MODEL_DIR,
                                  'word_spacing_model.sentences=%s.left_gram=%s.right_gram=%s/model' % (
                                      n_train, left_gram, right_gram))
        log.info('model_file: %s' % model_file)

        batch_size = 500  # mini batch size
        log.info('batch_size: %s' % batch_size)

        total_epoch = 100  # alternative: min(100, 1000000 // n_train), i.e. 1-100
        features_vector = CharOneHotVector(DataFileUtil.read_list(characters_file))
        labels_vector = CharOneHotVector([0, 1])  # 0 = no space, 1 = space
        n_features = len(features_vector) * ngram  # number of features = 17,380 * 4
        n_classes = len(labels_vector) if len(labels_vector) >= 3 else 1  # 2 classes, but a binary output needs only 1 unit
        n_hidden1 = 100
        learning_rate = 0.001  # alternative: min(0.1, 0.001 * total_epoch), i.e. 0.001-0.1
        early_stop_cost = 0.0001
        log.info('features_vector: %s' % features_vector)
        log.info('labels_vector: %s' % labels_vector)
        log.info('n_features: %s' % n_features)
        log.info('n_classes: %s' % n_classes)
        log.info('n_hidden1: %s' % n_hidden1)
        log.info('learning_rate: %s' % learning_rate)
        log.info('early_stop_cost: %s' % early_stop_cost)

        log.info('sample testing...')
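
A note on the feature layout above: n_features = len(features_vector) * ngram means each spacing decision is fed to the network as the concatenation of ngram one-hot character vectors (left_gram characters to the left of the boundary, right_gram to the right). CharOneHotVector's API is not shown in this example, so the sketch below is a minimal stand-in; the to_vector() method name is an assumption used purely for illustration.

import numpy as np

class CharOneHotVector:
    # Minimal stand-in for the class used in the example above.
    def __init__(self, chars):
        self.index = {c: i for i, c in enumerate(chars)}

    def __len__(self):
        return len(self.index)  # vocabulary size, e.g. the 17,380 noted above

    def to_vector(self, char):  # assumed method name, for illustration only
        v = np.zeros(len(self), dtype=np.float32)
        if char in self.index:  # unknown characters stay all-zero
            v[self.index[char]] = 1.0
        return v

features_vector = CharOneHotVector(list('가나다'))
window = ['가', '나', '다', '나']  # ngram = left_gram + right_gram characters
x = np.concatenate([features_vector.to_vector(c) for c in window])
assert x.shape == (len(features_vector) * len(window),)  # matches n_features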
Example #2
    @staticmethod
    def from_file(characters_file):
        # Build a CharVocab from a characters file via DataFileUtil.read_list.
        chars = DataFileUtil.read_list(characters_file)
        return CharVocab(chars)
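
Since from_file hard-codes the CharVocab name, a subclass-friendly variant is the @classmethod factory idiom; a sketch (assuming CharVocab's constructor takes the character list, as the call above suggests):

    @classmethod
    def from_file(cls, characters_file):
        chars = DataFileUtil.read_list(characters_file)
        return cls(chars)  # a subclass calling from_file gets an instance of itself

Usage is unchanged for the base class: vocab = CharVocab.from_file(characters_file).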