Example #1
    def dump_corpus(mongo_url, db_name, collection_name, sentences_file, mongo_query=None, limit=None):
        """
        Read documents from MongoDB and write them out sentence by sentence. (Sentences that contain only one word or no Hangul at all are skipped.)
        :param mongo_url: mongodb://~~~
        :param db_name: database name of mongodb
        :param collection_name: collection name of mongodb
        :param sentences_file: *.sentence file
        :param mongo_query: default={}
        :param limit:
        :return:
        """
        if mongo_query is None:
            mongo_query = {}

        corpus_mongo = MongodbUtil(mongo_url, db_name=db_name, collection_name=collection_name)
        total = corpus_mongo.count()
        log.info('%s total: %s' % (corpus_mongo, NumUtil.comma_str(total)))

        output_dir = os.path.dirname(sentences_file)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        with gzip.open(sentences_file, 'wt') as out_f:
            for i, row in enumerate(corpus_mongo.find(mongo_query, limit=limit)):
                # print('url:', row['url'])
                if i % 1000 == 0:
                    print('%.1f%% written.' % (i / total * 100))
                for c in row['content']:
                    for s in HangulUtil.text2sentences(c['sentences']):
                        if HangulUtil.has_hangul(s):
                            out_f.write(s)
                            out_f.write('\n')
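The filter above relies on HangulUtil.has_hangul from this project; purely as an illustration (not the project's implementation), a Hangul presence check can be written as a Unicode-range test:

    import re

    # Hypothetical stand-in for HangulUtil.has_hangul: True if the string
    # contains at least one Hangul syllable or jamo character.
    HANGUL_RE = re.compile(r'[\uAC00-\uD7A3\u1100-\u11FF\u3130-\u318F]')

    def has_hangul(s: str) -> bool:
        return HANGUL_RE.search(s) is not None

    print(has_hangul('위키백과 문서'))  # True
    print(has_hangul('english only'))  # False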
Example #2
    def __to_one_hot_vector(self,
                            features_batch: np.ndarray,
                            labels_batch: np.ndarray,
                            verbose=False):
        _features, _labels = [], []
        check_interval = min(1000, math.ceil(features_batch.shape[0]))
        for i, (feature_string,
                label_string) in enumerate(zip(features_batch, labels_batch)):
            if isinstance(feature_string, str) or isinstance(
                    feature_string, list):
                feature_v = self.features_vector.to_vectors(
                    feature_string)  # to 2 dim
                feature = np.concatenate(feature_v)  # to 1 dim
            else:
                feature = self.features_vector.to_vector(
                    feature_string)  # to 1 dim

            if isinstance(label_string, str) or isinstance(label_string, list):
                label_v = self.labels_vector.to_vectors(
                    label_string)  # to 2 dim
                label = np.concatenate(label_v)  # to 1 dim
            else:
                label = self.labels_vector.to_vector(label_string)  # to 1 dim

            _features.append(feature)
            _labels.append(label)

            if verbose and i % check_interval == 0:
                log.info(
                    '[%s] to_one_hot_vector %s -> %s, %s (len=%s) %s (len=%s)'
                    % (i, feature_string, label_string, feature, len(feature),
                       label, len(label)))
        return np.asarray(_features,
                          dtype=np.int32), np.asarray(_labels, dtype=np.int32)
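A toy, self-contained illustration of what the per-character conversion above amounts to (assuming to_vectors returns one one-hot row per character, which np.concatenate then flattens):

    import numpy as np

    # Toy vocabulary; the real features_vector is built from the corpus characters.
    vocab = {'a': 0, 'b': 1, 'c': 2}

    def to_one_hot(ch):
        v = np.zeros(len(vocab), dtype=np.int32)
        v[vocab[ch]] = 1
        return v

    feature = np.concatenate([to_one_hot(c) for c in 'cab'])  # 2-dim rows -> 1-dim vector
    print(feature)  # [0 0 1 1 0 0 0 1 0]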
Example #3
 def encode_noise(cls,
                  s,
                  noise_rate=0.1,
                  noise_with_blank=False,
                  verbose=False):
     try:
         hangul_indexs = [
             idx for idx, c in enumerate(s) if HangulUtil.is_hangul_char(c)
         ]
         if len(hangul_indexs) == 0:
             return s
         target_indexs = np.random.choice(
             hangul_indexs,
             math.ceil(len(hangul_indexs) * noise_rate),
             replace=False)
         _s = list(s)
         for idx in target_indexs:
             if noise_with_blank:
                 _s[idx] = ' '
             else:
                 c = s[idx]
                 _c = HangulUtil.encode_noise(c)
                 if verbose:
                     log.info('encode: %s -> %s' % (c, _c))
                 _s[idx] = _c
         return ''.join(_s)
     except Exception:
         return s
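A minimal standalone sketch of the same noising idea with noise_with_blank=True (the jamo-level HangulUtil.encode_noise is project-specific and not reproduced here):

    import math

    import numpy as np

    s = '한국어 문장 예시'
    noise_rate = 0.1
    hangul_indexes = [i for i, c in enumerate(s) if '가' <= c <= '힣']
    picked = set(np.random.choice(hangul_indexes,
                                  math.ceil(len(hangul_indexes) * noise_rate),
                                  replace=False))
    noised = ''.join(' ' if i in picked else c for i, c in enumerate(s))
    print('%s -> %s' % (s, noised))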
Example #4
    def dump_urls(mongo_url,
                  db_name,
                  collection_name,
                  urls_file,
                  mongo_query=None,
                  limit=0):
        if mongo_query is None:
            mongo_query = {}

        corpus_mongo = MongodbUtil(mongo_url,
                                   db_name=db_name,
                                   collection_name=collection_name)
        total = corpus_mongo.count()
        log.info('%s total: %s' % (corpus_mongo, NumUtil.comma_str(total)))

        output_dir = os.path.dirname(urls_file)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        with open(urls_file, 'wt') as out_f:
            for i, row in enumerate(corpus_mongo.find(mongo_query,
                                                      limit=limit)):
                if i % 1000 == 0:
                    log.info('%s %.1f%% written.' %
                             (os.path.basename(urls_file), i / total * 100))
                out_f.write(row['url'])
                out_f.write('\n')
Example #5
    def decode_noise(cls,
                     noised_sentence,
                     features_list,
                     labels_list,
                     verbose=False):
        try:
            if len(features_list) != len(labels_list) or len(
                    features_list[0]) != len(labels_list[0]):
                return noised_sentence

            idx2chars = dict()
            for feature, label in zip(features_list, labels_list):
                for off in [
                        i for i in range(len(feature))
                        if feature[i] != label[i]
                ]:
                    for start in [
                            m.start()
                            for m in re.finditer(feature, noised_sentence)
                    ]:
                        idx = start + off
                        if idx not in idx2chars:
                            idx2chars[idx] = label[off]
            sentence = list(noised_sentence)
            for idx, char in idx2chars.items():
                if verbose:
                    log.info('denoise: "%s" -> "%s"' %
                             (noised_sentence[idx], char))
                sentence[idx] = char
            return ''.join(sentence)
        except Exception:
            return noised_sentence
Example #6
    def __build_FFNN_layers2(cls,
                             n_features,
                             n_classes,
                             n_hidden1,
                             learning_rate,
                             watch=WatchUtil()):
        if len(cls.graph_nodes) == 0:
            log.info('create tensorflow graph...')
            watch.start('create tensorflow graph')
            log.info('n_features: %s' % n_features)
            log.info('n_classes: %s' % n_classes)
            log.info('n_hidden1: %s' % n_hidden1)

            tf.set_random_seed(777)  # for reproducibility

            X = tf.placeholder(tf.float32, [None, n_features],
                               name='X')  # two characters
            Y = tf.placeholder(tf.float32, [None, n_classes], name='Y')

            W1 = tf.Variable(tf.random_normal([n_features, n_hidden1]),
                             name='W1')
            b1 = tf.Variable(tf.random_normal([n_hidden1]), name='b1')
            layer1 = tf.sigmoid(tf.matmul(X, W1) + b1, name='layer1')

            W2 = tf.Variable(tf.random_normal([n_hidden1, n_classes]),
                             name='W2')
            b2 = tf.Variable(tf.random_normal([n_classes]), name='b2')
            hypothesis = tf.sigmoid(tf.matmul(layer1, W2) + b2,
                                    name='hypothesis')

            cost = -tf.reduce_mean(Y * tf.log(hypothesis) +
                                   (1 - Y) * tf.log(1 - hypothesis),
                                   name='cost')  # cost/loss function

            # train_step = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(cost)  # Too bad. sentences=10000 + layer=2, 20 min, Accuracy: 0.689373, cost: 0.8719
            train_step = tf.train.AdamOptimizer(
                learning_rate=learning_rate
            ).minimize(
                cost
            )  # Very good!! sentences=10000 + layer=2, 10 min, accuracy 0.9194, cost: 0.2139

            predicted = tf.cast(hypothesis > 0.5,
                                dtype=tf.float32,
                                name='predicted')  # 0 <= hypothesis <= 1
            accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted, Y),
                                              dtype=tf.float32),
                                      name='accuracy')
            watch.stop('create tensorflow graph')
            log.info('create tensorflow graph OK.\n')
            cls.graph_nodes = {
                'hypothesis': hypothesis,
                'predicted': predicted,
                'accuracy': accuracy,
                'X': X,
                'Y': Y,
                'train_step': train_step,
                'cost': cost
            }
        return cls.graph_nodes
Example #7
    def load(cls, filepath: str, gzip_format=False, max_len=0, verbose=False):
        filename = os.path.basename(filepath)
        if gzip_format:
            f = gzip.open(filepath, 'rb')
        else:
            f = open(filepath, 'rb')

        with f:
            d = DataSet()
            d.name, d.size, d.features_vector, d.labels_vector = pickle.load(
                f), pickle.load(f), pickle.load(f), pickle.load(f)

            check_interval = min(100000, math.ceil(d.size))
            features, labels = [], []
            for i in range(d.size):
                if 0 < max_len <= len(features):
                    break
                feature, label = pickle.load(f), pickle.load(f)
                # print('load feature:', feature, 'label:', label)
                features.append(feature)
                labels.append(label)
                if verbose and i % check_interval == 0:
                    log.info('%s %.1f%% loaded.' %
                             (filename, i / d.size * 100))
            log.info('%s 100%% loaded.' % filename)
            d.features = np.asarray(features)
            d.labels = np.asarray(labels)
            log.info('%s features shape: %s' % (filename, d.features.shape))
            log.info('%s labels shape: %s' % (filename, d.labels.shape))
        return d
Example #8
 def build_FFNN(cls,
                n_features,
                n_classes,
                n_hidden1,
                learning_rate,
                watch=WatchUtil(),
                layers=4):
     log.info('\nbuild_FFNN(layers=%s)' % layers)
     if layers == 2:
         return cls.__build_FFNN_layers2(n_features,
                                         n_classes,
                                         n_hidden1,
                                         learning_rate,
                                         watch=watch)
     else:
         return cls.__build_FFNN_layers4(n_features,
                                         n_classes,
                                         n_hidden1,
                                         learning_rate,
                                         watch=watch)
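A hypothetical usage sketch (TensorFlow 1.x, matching the graph code above): build the cached 2-layer graph once and run a single training step on a random one-hot batch. The sizes are made up for illustration.

    import numpy as np
    import tensorflow as tf

    graph = WordSpacing.build_FFNN(n_features=8, n_classes=1, n_hidden1=4,
                                   learning_rate=0.01, layers=2)
    x_batch = np.random.randint(0, 2, size=(32, 8)).astype(np.float32)
    y_batch = np.random.randint(0, 2, size=(32, 1)).astype(np.float32)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        _, c = sess.run([graph['train_step'], graph['cost']],
                        feed_dict={graph['X']: x_batch, graph['Y']: y_batch})
        print('cost:', c)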
Example #9
File: dataset.py  Project: calc2te/nlp4kor
    def save(self, filepath: str, gzip_format=False, verbose=False):
        filename = os.path.basename(filepath)
        if gzip_format:
            f = gzip.open(filepath, 'wb')
        else:
            f = open(filepath, 'wb')

        with f:
            for o in [
                    self.name, self.size, self.features_vector,
                    self.labels_vector, self.labels
            ]:
                pickle.dump(o, f)

            check_interval = min(100000, math.ceil(self.size))
            for i, o in enumerate(self.features):
                pickle.dump(o, f)
                if verbose and i % check_interval == 0:
                    log.info('%s %.1f%% saved.' %
                             (filename, i / self.size * 100))
            log.info('%s 100%% saved.' % filename)
Example #10
    def save(self, filepath: str, gzip_format=False, verbose=False):
        filename = os.path.basename(filepath)
        if gzip_format:
            f = gzip.open(filepath, 'wb')
        else:
            f = open(filepath, 'wb')

        with f:
            for o in [
                    self.name, self.size, self.features_vector,
                    self.labels_vector
            ]:
                pickle.dump(o, f)

            check_interval = min(100000, math.ceil(self.size))
            for i, (feature,
                    label) in enumerate(zip(self.features, self.labels)):
                # print('save feature:', feature, 'label:', label)
                pickle.dump(feature, f)
                pickle.dump(label, f)
                if verbose and i % check_interval == 0:
                    log.info('%s %.1f%% saved.' %
                             (filename, i / self.size * 100))

            log.info('%s 100%% saved.' % filename)
            log.info('shape: %s' % self.features.shape)
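For reference, a minimal standalone sketch of the streamed-pickle layout the save()/load() pair uses: a few header objects first, then one pickled record per example.

    import gzip
    import pickle

    records = [('한국어', 1), ('코퍼스', 0)]
    with gzip.open('/tmp/dataset_sketch.pkl.gz', 'wb') as f:
        pickle.dump('demo', f)        # name
        pickle.dump(len(records), f)  # size
        for feature, label in records:
            pickle.dump(feature, f)
            pickle.dump(label, f)

    with gzip.open('/tmp/dataset_sketch.pkl.gz', 'rb') as f:
        name, size = pickle.load(f), pickle.load(f)
        pairs = [(pickle.load(f), pickle.load(f)) for _ in range(size)]
    print(name, size, pairs)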
Example #11
    def collect_characters(sentences_file: str,
                           characters_file: str,
                           max_test: int = 0):
        """
        Read the sentences file and extract the unique characters (syllables).
        These are later used to build one-hot vectors for the corpus.
        :param sentences_file: *.sentences file path
        :param characters_file: *.characters file path
        :param max_test: 0=run all
        :return:
        """
        total = FileUtil.count_lines(sentences_file, gzip_format=True)
        log.info('total: %s' % NumUtil.comma_str(total))

        char_set = set()
        with gzip.open(sentences_file, 'rt') as f:
            for i, sentence in enumerate(f):
                i += 1
                if i % 10000 == 0:
                    log.info(
                        '%s %.1f%% written.' %
                        (os.path.basename(characters_file), i / total * 100))
                _char_set = set([c for c in sentence])
                char_set.update(_char_set)
                if 0 < max_test <= i:
                    break

        char_list = list(char_set)
        char_list.sort()
        if max_test == 0:  # 0=full
            with open(characters_file, 'w') as f:
                for c in char_list:
                    f.write(c)
                    f.write('\n')
                log.info('written to %s OK.' % characters_file)
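The core of collect_characters is just a set union over characters; a tiny self-contained illustration:

    sentences = ['한국어 위키백과 문장', '띄어쓰기 예제 문장']
    char_set = set()
    for sentence in sentences:
        char_set.update(sentence.strip())
    char_list = sorted(char_set)
    print(len(char_list), char_list)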
Example #12
File: dataset.py  Project: calc2te/nlp4kor
    def load(cls, filepath: str, gzip_format=False, verbose=False):
        filename = os.path.basename(filepath)
        if gzip_format:
            f = gzip.open(filepath, 'rb')
        else:
            f = open(filepath, 'rb')

        with f:
            d = DataSet()
            d.name, d.size, d.features_vector, d.labels_vector, d.labels = \
                pickle.load(f), pickle.load(f), pickle.load(f), pickle.load(f), pickle.load(f)

            check_interval = min(100000, math.ceil(d.size))
            li = []
            for i in range(d.size):
                li.append(pickle.load(f))
                if verbose and i % check_interval == 0:
                    log.info('%s %.1f%% loaded.' %
                             (filename, i / d.size * 100))
            log.info('%s 100%% loaded.' % filename)
            d.features = np.asarray(li)
        return d
Example #13
File: dataset.py  Project: calc2te/nlp4kor
    def to_one_hot_vector(self,
                          features_batch: np.ndarray,
                          labels_batch: np.ndarray,
                          verbose=False):
        _features, _labels = [], []
        for i, (chars,
                has_space) in enumerate(zip(features_batch, labels_batch)):
            chars_v = self.features_vector.to_vectors(chars)
            feature = np.concatenate(chars_v)  # concated feature

            label = self.labels_vector.to_vector(has_space)
            _features.append(feature)
            _labels.append(label)

            check_interval = min(1000, math.ceil(features_batch.shape[0]))
            if verbose and i % check_interval == 0:
                log.info(
                    '[%s] to_one_hot_vector %s -> %s, %s (len=%s) %s (len=%s)'
                    % (i, chars, has_space, feature, len(feature), label,
                       len(label)))
        return np.asarray(_features,
                          dtype=np.int32), np.asarray(_labels, dtype=np.int32)
Example #14
    def learning(cls, total_epoch, n_train, n_valid, n_test, batch_size, window_size, noise_rate, model_file, features_vector, labels_vector,
                 n_hidden1,
                 learning_rate,
                 dropout_keep_rate, early_stop_cost=0.001):
        n_features = len(features_vector) * window_size  # number of features = 17,382 * 10

        log.info('load characters list...')
        log.info('load characters list OK. len: %s' % NumUtil.comma_str(len(features_vector)))
        watch = WatchUtil()

        train_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'spelling_error_correction',
                                  'ko.wikipedia.org.dataset.sentences=%s.window_size=%d.train.gz' % (n_train, window_size))
        valid_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'spelling_error_correction',
                                  'ko.wikipedia.org.dataset.sentences=%s.window_size=%d.valid.gz' % (n_valid, window_size))
        test_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'spelling_error_correction',
                                 'ko.wikipedia.org.dataset.sentences=%s.window_size=%d.test.gz' % (n_test, window_size))

        log.info('train_file: %s' % train_file)
        log.info('valid_file: %s' % valid_file)
        log.info('test_file: %s' % test_file)
        if not os.path.exists(train_file) or not os.path.exists(valid_file) or not os.path.exists(test_file):
            dataset_dir = os.path.dirname(train_file)
            if not os.path.exists(dataset_dir):
                os.makedirs(dataset_dir)

            watch.start('create dataset')  # FIXME: out of memory (1M sentences)
            log.info('create dataset...')

            data_files = (('train', KO_WIKIPEDIA_ORG_TRAIN_SENTENCES_FILE, n_train, train_file, False),
                          ('valid', KO_WIKIPEDIA_ORG_VALID_SENTENCES_FILE, n_valid, valid_file, False),
                          ('test', KO_WIKIPEDIA_ORG_TEST_SENTENCES_FILE, n_test, test_file, False))

            for (name, data_file, total, dataset_file, to_one_hot_vector) in data_files:
                check_interval = 10000
                log.info('check_interval: %s' % check_interval)
                log.info('%s %s total: %s' % (name, os.path.basename(data_file), NumUtil.comma_str(total)))
                log.info('noise_rate: %s' % noise_rate)

                features, labels = [], []
                with gzip.open(data_file, 'rt') as f:
                    for i, line in enumerate(f, 1):
                        if total < i:
                            break

                        if i % check_interval == 0:
                            time.sleep(0.01)  # prevent cpu overload
                            percent = i / total * 100
                            log.info('create dataset... %.1f%% read. data len: %s. %s' % (percent, NumUtil.comma_str(len(features)), data_file))

                        sentence = line.strip()
                        for start in range(0, len(sentence) - window_size + 1):  # generate noise (a blank) at the character level
                            chars = sentence[start: start + window_size]
                            for idx in range(len(chars)):
                                noised_chars = StringUtil.replace_with_index(chars, ' ', idx)
                                features.append(noised_chars)
                                labels.append(chars)
                                log.debug('create dataset... %s "%s" -> "%s"' % (name, noised_chars, chars))

                # log.info('noise_sampling: %s' % noise_sampling)
                #         for nth_sample in range(noise_sampling): # generate noise at the jamo (initial/medial/final) level
                #             for start in range(0, len(sentence) - window_size + 1):
                #                 chars = sentence[start: start + window_size]
                #                 noised_chars = SpellingErrorCorrection.encode_noise(chars, noise_rate=noise_rate, noise_with_blank=True)
                #                 if chars == noised_chars:
                #                     continue
                #                 if i % check_interval == 0 and nth_sample == 0:
                #                     log.info('create dataset... %s "%s" -> "%s"' % (name, noised_chars, chars))
                #                 features.append(noised_chars)
                #                 labels.append(chars)

                # print('dataset features:', features)
                # print('dataset labels:', labels)
                dataset = DataSet(features=features, labels=labels, features_vector=features_vector, labels_vector=labels_vector, name=name)
                log.info('dataset save... %s' % dataset_file)
                dataset.save(dataset_file, gzip_format=True, verbose=True)
                log.info('dataset save OK. %s' % dataset_file)
                log.info('dataset: %s' % dataset)

            log.info('create dataset OK.')
            log.info('')
            watch.stop('create dataset')

        watch.start('dataset load')
        log.info('dataset load...')
        train = DataSet.load(train_file, gzip_format=True, verbose=True)

        if n_train >= int('100,000'.replace(',', '')):
            valid = DataSet.load(valid_file, gzip_format=True, verbose=True)
        else:
            valid = DataSet.load(train_file, gzip_format=True, verbose=True)
        log.info('valid.convert_to_one_hot_vector()...')
        valid = valid.convert_to_one_hot_vector(verbose=True)
        log.info('valid.convert_to_one_hot_vector() OK.')

        log.info('train dataset: %s' % train)
        log.info('valid dataset: %s' % valid)
        log.info('dataset load OK.')
        log.info('')
        watch.stop('dataset load')

        X, Y, dropout_keep_prob, train_step, cost, y_hat, accuracy = SpellingErrorCorrection.build_DAE(n_features, window_size, noise_rate, n_hidden1,
                                                                                                       learning_rate, watch)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            check_interval = max(1, min(1000, n_train // 10))
            nth_train, nth_input, total_input = 0, 0, total_epoch * train.size

            log.info('')
            log.info('learn...')
            log.info('total_epoch: %s' % total_epoch)
            log.info('train.size (total features): %s' % NumUtil.comma_str(train.size))
            log.info('check_interval: %s' % check_interval)
            log.info('total_epoch: %s' % total_epoch)
            log.info('batch_size: %s' % batch_size)
            log.info('total_input: %s (total_epoch * train.size)' % total_input)
            log.info('')
            watch.start('learn')
            valid_cost = sys.float_info.max
            for epoch in range(1, total_epoch + 1):
                if valid_cost < early_stop_cost:
                    log.info('valid_cost: %s, early_stop_cost: %s, early stopped.' % (valid_cost, early_stop_cost))
                    break
                for step, (features_batch, labels_batch) in enumerate(train.next_batch(batch_size=batch_size, to_one_hot_vector=True), 1):
                    if valid_cost < early_stop_cost:
                        break

                    nth_train += 1
                    nth_input += features_batch.shape[0]
                    sess.run(train_step, feed_dict={X: features_batch, Y: labels_batch, dropout_keep_prob: dropout_keep_rate})

                    # if nth_train % check_interval == 1:
                    percent = nth_input / total_input * 100
                    valid_cost = sess.run(cost, feed_dict={X: valid.features, Y: valid.labels, dropout_keep_prob: 1.0})
                    log.info('[epoch=%s][%.1f%%] %s cost: %.8f' % (epoch, percent, valid.name, valid_cost))

            watch.stop('learn')
            log.info('learn OK.')
            log.info('')

            log.info('model save... %s' % model_file)
            watch.start('model save...')
            model_dir = os.path.dirname(model_file)
            if not os.path.exists(model_dir):
                os.makedirs(model_dir)
            saver = tf.train.Saver()
            saver.save(sess, model_file)
            watch.stop('model save...')
            log.info('model save OK. %s' % model_file)

        log.info('')
        log.info('total_epoch: %s' % total_epoch)
        log.info('batch_size: %s' % batch_size)
        log.info('total_input: %s (total_epoch * train.size)' % total_input)
        log.info('')
        log.info(watch.summary())
        log.info('')
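A standalone sketch of the window/noise generation loop above (StringUtil.replace_with_index is assumed to blank out the character at the given index):

    sentence = '한국어 맞춤법 교정'
    window_size = 4
    features, labels = [], []
    for start in range(0, len(sentence) - window_size + 1):
        chars = sentence[start:start + window_size]
        for idx in range(len(chars)):
            noised_chars = chars[:idx] + ' ' + chars[idx + 1:]  # replace_with_index(chars, ' ', idx)
            features.append(noised_chars)
            labels.append(chars)
    print(features[:3], labels[:3])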
Example #15
    def build_DAE(cls, n_features, window_size, noise_rate, n_hidden1, learning_rate, watch=WatchUtil()):
        if len(cls.graph) == 0:
            log.info('')
            log.info('create tensorflow graph...')
            watch.start('create tensorflow graph')

            features_vector_size = n_features // window_size
            log.info('n_features: %s' % n_features)
            log.info('window_size: %s' % window_size)
            log.info('features_vector_size: %s' % features_vector_size)

            log.info('noise_rate: %.1f' % noise_rate)
            log.info('n_hidden1: %s' % n_hidden1)

            tf.set_random_seed(777)  # for reproducibility

            X = tf.placeholder(tf.float32, [None, n_features], name='X')  # shape=(batch_size, window_size * feature_vector.size)
            Y = tf.placeholder(tf.float32, [None, n_features], name='Y')  # shape=(batch_size, window_size * feature_vector.size)
            dropout_keep_prob = tf.placeholder(tf.float32)

            # layers = 3
            # n_hidden2 = n_hidden1
            # W1 = tf.Variable(tf.random_normal([n_features, n_hidden1]), name='W1')
            # b1 = tf.Variable(tf.random_normal([n_hidden1]), name='b1')
            # layer1 = tf.nn.sigmoid(tf.matmul(X, W1) + b1, name='layer1')
            # layer1_dropout = tf.nn.dropout(layer1, dropout_keep_prob, name='layer1_dropout')
            #
            # W2 = tf.Variable(tf.random_normal([n_hidden1, n_hidden2]), name='W2')
            # b2 = tf.Variable(tf.random_normal([n_hidden2]), name='b2')
            # layer2 = tf.nn.sigmoid(tf.matmul(layer1_dropout, W2) + b2, name='layer2')
            # layer2_dropout = tf.nn.dropout(layer2, dropout_keep_prob, name='layer2_dropout')
            #
            # W3 = tf.Variable(tf.random_normal([n_hidden1, n_features]), name='W3')
            # b3 = tf.Variable(tf.random_normal([n_features]), name='b3')
            # y_hat = tf.add(tf.matmul(layer2_dropout, W3), b3, name='y_hat')

            # layers = 2
            W1 = tf.Variable(tf.random_normal([n_features, n_hidden1]), name='W1')
            b1 = tf.Variable(tf.random_normal([n_hidden1]), name='b1')
            layer1 = tf.nn.sigmoid(tf.matmul(X, W1) + b1, name='layer1')
            layer1_dropout = tf.nn.dropout(layer1, dropout_keep_prob, name='layer1_dropout')

            W2 = tf.Variable(tf.random_normal([n_hidden1, n_features]), name='W2')
            b2 = tf.Variable(tf.random_normal([n_features]), name='b2')
            y_hat = tf.add(tf.matmul(layer1_dropout, W2), b2, name='y_hat')  # shape=(batch_size, window_size * feature_vector.size)

            labels_hat = tf.reshape(y_hat, shape=(-1, window_size, features_vector_size))  # shape=(batch_size, window_size, feature_vector.size)
            labels = tf.reshape(Y, shape=(-1, window_size, features_vector_size))  # shape=(batch_size, window_size, feature_vector.size)

            cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=labels_hat, labels=labels), name='cost')
            train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

            accuracy = tf.reduce_mean(tf.cast(tf.abs(tf.nn.softmax(y_hat) - Y) < 0.1, dtype=tf.float32), name='accuracy')
            # log.debug('X:', X)
            # log.debug('Y:', Y)
            # log.debug('y_hat:', y_hat)
            # log.debug('labels_hat:', labels_hat)
            # log.debug('labels:', labels)
            # log.debug('cost:', cost)
            # log.debug('accuracy:', accuracy)

            watch.stop('create tensorflow graph')
            log.info('create tensorflow graph OK.')
            log.info('')
            cls.graph = {'X': X, 'Y': Y, 'dropout_keep_prob': dropout_keep_prob,
                         'train_step': train_step, 'cost': cost, 'y_hat': y_hat, 'accuracy': accuracy, }
        return cls.graph['X'], cls.graph['Y'], cls.graph['dropout_keep_prob'], \
               cls.graph['train_step'], cls.graph['cost'], cls.graph['y_hat'], cls.graph['accuracy']
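A numpy-only sketch of the reshape the cost relies on: a flattened one-hot window of length window_size * features_vector_size is viewed as (window_size, features_vector_size), so the softmax cross-entropy is applied per character position.

    import numpy as np

    window_size, vocab_size = 3, 5
    flat = np.zeros((1, window_size * vocab_size), dtype=np.float32)
    flat[0, 0 * vocab_size + 2] = 1.0  # position 0 -> character 2
    flat[0, 1 * vocab_size + 1] = 1.0  # position 1 -> character 1
    flat[0, 2 * vocab_size + 4] = 1.0  # position 2 -> character 4
    per_char = flat.reshape(-1, window_size, vocab_size)
    print(per_char.shape)       # (1, 3, 5)
    print(per_char.argmax(-1))  # [[2 1 4]]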
Example #16
        if len(sentence) != len(sentence_hat):
            return sim, correct, total

        for a, b in zip(sentence, sentence_hat):
            if a == b:
                correct += 1

        sim = correct / total
        return sim, correct, total


if __name__ == '__main__':
    train_sentences_file = KO_WIKIPEDIA_ORG_TRAIN_SENTENCES_FILE
    valid_sentences_file = KO_WIKIPEDIA_ORG_VALID_SENTENCES_FILE
    test_sentences_file = KO_WIKIPEDIA_ORG_TEST_SENTENCES_FILE
    log.info('train_sentences_file: %s' % train_sentences_file)
    log.info('valid_sentences_file: %s' % valid_sentences_file)
    log.info('test_sentences_file: %s' % test_sentences_file)
    log.info('')

    characters_file = KO_WIKIPEDIA_ORG_CHARACTERS_FILE
    log.info('characters_file: %s' % characters_file)
    try:
        if len(sys.argv) == 4:
            n_train = int(sys.argv[1])
            window_size = int(sys.argv[2])
            noise_rate = float(sys.argv[3])
        else:
            n_train, noise_rate, window_size = None, None, None

        if n_train is None or n_train == 0:  # default
Example #17
    def learning(cls,
                 sentences_file,
                 batch_size,
                 left_gram,
                 right_gram,
                 model_file,
                 features_vector,
                 labels_vector,
                 n_hidden1=100,
                 max_sentences=0,
                 learning_rate=0.01,
                 layers=2):
        ngram = left_gram + right_gram
        n_features = len(
            features_vector) * ngram  # number of features = 17,380 * 4
        n_classes = len(labels_vector) if len(
            labels_vector) >= 3 else 1  # number of classes = 2 but len=1

        log.info('load characters list...')
        log.info('load characters list OK. len: %s\n' %
                 NumUtil.comma_str(len(features_vector)))
        watch = WatchUtil()

        train_file = os.path.join(
            KO_WIKIPEDIA_ORG_DATA_DIR, 'datasets',
            'ko.wikipedia.org.dataset.sentences=%d.left=%d.right=%d.train.gz' %
            (max_sentences, left_gram, right_gram))
        validation_file = train_file.replace('.train.', '.validation.')
        test_file = train_file.replace('.train.', '.test.')
        if not os.path.exists(train_file) or not os.path.exists(
                validation_file) or not os.path.exists(test_file):
            watch.start('create dataset')
            log.info('create dataset...')
            features, labels = [], []
            check_interval = min(10000, math.ceil(max_sentences))
            log.info('total: %s' % NumUtil.comma_str(max_sentences))

            with gzip.open(sentences_file, 'rt') as f:
                for i, line in enumerate(f, 1):
                    if max_sentences < i:
                        break

                    if i % check_interval == 0:
                        log.info(
                            'create dataset... %.1f%% read. data len: %s' %
                            (i / max_sentences * 100,
                             NumUtil.comma_str(len(features))))

                    _f, _l = WordSpacing.sentence2features_labels(
                        line.strip(),
                        left_gram=left_gram,
                        right_gram=right_gram)
                    features.extend(_f)
                    labels.extend(_l)

            dataset = DataSet(features=features,
                              labels=labels,
                              features_vector=features_vector,
                              labels_vector=labels_vector,
                              name='all')
            log.info('dataset: %s' % dataset)
            log.info('create dataset OK.\n')
            watch.stop('create dataset')

            watch.start('dataset save')
            log.info('split to train, test, validation...')
            datasets = DataSets.to_datasets(dataset,
                                            test_rate=0.1,
                                            valid_rate=0.1,
                                            test_max=10000,
                                            valid_max=1000,
                                            shuffle=True)
            train, test, validation = datasets.train, datasets.test, datasets.validation
            log.info(train)
            log.info(test)
            log.info(validation)
            # log.info('%s %s' % (test.features[0], test.labels[0]))
            log.info('split to train, test, validation OK.\n')

            log.info('dataset save... %s' % train_file)
            train.save(train_file, verbose=True)  # save as text
            log.info('dataset save OK.\n')

            log.info('dataset save... %s' % validation_file)
            validation = validation.convert_to_one_hot_vector(
                verbose=True)  # save as vector
            validation.save(validation_file, verbose=True)
            log.info('dataset save OK.\n')

            log.info('dataset save... %s' % test_file)
            test = test.convert_to_one_hot_vector(verbose=True)
            test.save(test_file, verbose=True)  # save as vector
            log.info('dataset save OK.\n')
            watch.stop('dataset save')
        else:
            watch.start('dataset load')
            log.info('dataset load...')
            train = DataSet.load(train_file, verbose=True)
            validation = DataSet.load(validation_file, verbose=True)
            test = DataSet.load(test_file, verbose=True)
            log.info(train)
            log.info(validation)
            log.info(test)
            log.info('dataset load OK.\n')
            watch.stop('dataset load')

        log.info('check samples...')
        for i, (features_batch, labels_batch) in enumerate(
                train.next_batch(batch_size=5, to_one_hot_vector=True), 1):
            if i > 2:
                break
            for a, b in zip(features_batch, labels_batch):
                feature, label = a, b
                _feature = feature.reshape((ngram, len(features_vector)))
                chars = ''.join(features_vector.to_values(_feature))
                has_space = np.argmax(label)
                log.info('[%s] %s -> %s, %s (len=%s) %s (len=%s)' %
                         (i, chars, has_space, feature, len(feature), label,
                          len(label)))
        log.info('check samples OK.\n')

        graph = WordSpacing.build_FFNN(n_features,
                                       n_classes,
                                       n_hidden1,
                                       learning_rate,
                                       watch,
                                       layers=layers)

        train_step, X, Y, cost, hypothesis, predicted, accuracy = graph[
            'train_step'], graph['X'], graph['Y'], graph['cost'], graph[
                'hypothesis'], graph['predicted'], graph['accuracy']

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            n_input = 0
            log.info('total: %s' % NumUtil.comma_str(train.size))
            log.info('learn...')
            watch.start('learn')
            for step, (features_batch, labels_batch) in enumerate(
                    train.next_batch(batch_size=batch_size), 1):
                n_input += batch_size
                sess.run(train_step,
                         feed_dict={
                             X: features_batch,
                             Y: labels_batch
                         })
                log.info(
                    '[%s][%.1f%%] validation cost: %.4f' %
                    (NumUtil.comma_str(n_input), n_input / train.size * 100,
                     sess.run(cost,
                              feed_dict={
                                  X: validation.features,
                                  Y: validation.labels
                              })))
            watch.stop('learn')
            log.info('learn OK.\n')

            log.info('evaluate...')
            watch.start('evaluate...')
            _hypothesis, _correct, _accuracy = sess.run(
                [hypothesis, predicted, accuracy],
                feed_dict={
                    X: test.features,
                    Y: test.labels
                })  # Accuracy report
            watch.stop('evaluate...')
            log.info('evaluate OK.')

            log.info('model save... %s' % model_file)
            watch.start('model save...')
            model_dir = os.path.dirname(model_file)
            if not os.path.exists(model_dir):
                os.makedirs(model_dir)
            saver = tf.train.Saver()
            saver.save(sess, model_file)
            watch.stop('model save...')
            log.info('model save OK. %s' % model_file)

        log.info('\n')
        log.info(watch.summary())
        # log.info('hypothesis: %s %s' % (_hypothesis.shape, _hypothesis))
        # log.info('correct: %s %s' % (_correct.shape, _correct))
        log.info('accuracy: %s %s' % (_accuracy.shape, _accuracy))
        log.info('\n')
Example #18
def create_graph(model_name,
                 scope_name,
                 first_pipeline,
                 second_pipeline,
                 verbose=False):
    """
    create or reuse graph
    :param model_name:
    :param scope_name:
    :param first_pipeline:
    :param second_pipeline:
    :param verbose: print graph nodes
    :return: tensorflow graph nodes
    """
    with tf.variable_scope('common'):  # for reusing graph
        use_first_pipeline = tf.placeholder(dtype=bool)
        learning_rate = tf.placeholder(dtype=tf.float32, name='learning_rate')

        W1 = tf.get_variable(dtype=tf.float32,
                             shape=[input_len, output_len],
                             initializer=tf.random_normal_initializer(),
                             name='W1')
        b1 = tf.get_variable(dtype=tf.float32,
                             initializer=tf.constant(0.0, shape=[output_len]),
                             name='b1')

        x, y = tf.cond(use_first_pipeline, lambda: first_pipeline,
                       lambda: second_pipeline)

        y_hat = tf.add(tf.matmul(x, W1), b1, name='y_hat')
        cost = tf.reduce_mean(tf.square(y_hat - y), name='cost')
        train_step = tf.train.AdamOptimizer(
            learning_rate=learning_rate).minimize(cost, name='train_step')

    with tf.variable_scope(scope_name, reuse=None):
        _W1 = tf.summary.histogram(values=W1, name='_W1')
        _b1 = tf.summary.histogram(values=b1, name='_b1')
        _cost = tf.summary.scalar(tensor=cost, name='_cost')
        summary = tf.summary.merge([_W1, _b1, _cost],
                                   name='summary')  # merge_all()
        if verbose:
            log.info('')
            log.info(x)
            log.info(W1)
            log.info(b1)
            log.info('')
            log.info(y)
            log.info(y_hat)
            log.info(cost)
    return x, y, learning_rate, use_first_pipeline, W1, b1, y_hat, cost, train_step, summary
if __name__ == '__main__':
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # ignore tensorflow warnings
    tf.logging.set_verbosity(tf.logging.ERROR)  # ignore tensorflow info

    func = multiply
    n_features = 2  # x1, x2
    n_classes = 1  # y
    digits = list(range(-99, 100, 1))
    n_train, n_test = 4000, 100  # 1% of 200 * 200

    x_data = np.random.choice(digits, (n_train + n_test, n_features), replace=True)
    y_data = func(x_data)
    x_train, x_test = x_data[:n_train], x_data[n_train:]
    y_train, y_test = y_data[:n_train], y_data[n_train:]

    log.info('')
    log.info('func: %s' % func.__name__)
    log.info('digits: %s ~ %s ' % (min(digits), max(digits)))
    log.info('x_train: %s' % str(x_train.shape))
    log.info(x_data[:5])
    log.info('y_train: %s' % str(y_train.shape))
    log.info(y_data[:5])
    log.info('x_test: %s' % str(x_test.shape))
    log.info('y_test %s' % str(y_test.shape))

    valid_check_interval = 0.5
    bias_value = 0.0
    early_stop_cost = 0.1  # stop learning

    # default values
    optimizer = tf.train.AdamOptimizer
Example #20
import os

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

from bage_utils.base_util import is_server
from nlp4kor.config import MNIST_DATA_DIR, MNIST_DAE_MODEL_DIR, log

if __name__ == '__main__':
    mnist_data = os.path.join(MNIST_DATA_DIR, MNIST_DAE_MODEL_DIR)  # input
    device2use = '/gpu:0' if is_server() else '/cpu:0'

    model_file = os.path.join(MNIST_DAE_MODEL_DIR,
                              'dae_mnist_model/model')  # .%s' % max_sentences
    log.info('model_file: %s' % model_file)

    model_dir = os.path.dirname(model_file)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    image_shape = (28, 28)
    mnist = input_data.read_data_sets(mnist_data, one_hot=True)
    assert (mnist.train.images.shape[1] == mnist.test.images.shape[1])
    n_input_dim = mnist.train.images.shape[
        1]  # MNIST data input (img shape: 28*28)
    n_output_dim = n_input_dim  # MNIST data input (img shape: 28*28)
    n_hidden_1 = 256  # 1st layer num features
    n_hidden_2 = 256  # 2nd layer num features

    log.info('n_input_dim: %s' % n_input_dim)
Example #21
            info_f.write('total docs: %s\n' % NumUtil.comma_str(total_docs))
            info_f.write('total docs: %s (has hangul sentence)\n' % NumUtil.comma_str(n_docs))
            info_f.write('total sentences: %s (has hangul sentence)\n' % NumUtil.comma_str(n_total))
            info_f.write('train: %s\n' % NumUtil.comma_str(n_train))
            info_f.write('valid: %s\n' % NumUtil.comma_str(n_valid))
            info_f.write('test: %s\n' % NumUtil.comma_str(n_test))
            info_f.write('total characters: %s\n' % NumUtil.comma_str(len(char_list)))


if __name__ == '__main__':
    info_file = KO_WIKIPEDIA_ORG_INFO_FILE
    urls_file = KO_WIKIPEDIA_ORG_URLS_FILE
    sentences_file = KO_WIKIPEDIA_ORG_SENTENCES_FILE
    characters_file = KO_WIKIPEDIA_ORG_CHARACTERS_FILE
    log.info('info_file: %s' % info_file)
    log.info('urls_file: %s' % urls_file)
    log.info('sentences_file: %s' % sentences_file)
    log.info('characters_file: %s' % characters_file)

    if not os.path.exists(characters_file) or not os.path.exists(sentences_file) or not os.path.exists(info_file) or not os.path.exists(urls_file):
        try:
            log.info('create sentences file...')
            TextPreprocess.dump_corpus(MONGO_URL, db_name='parsed', collection_name='ko.wikipedia.org', sentences_file=sentences_file,
                                       characters_file=characters_file,
                                       info_file=info_file, urls_file=urls_file,
                                       train_sentences_file=KO_WIKIPEDIA_ORG_TRAIN_SENTENCES_FILE,
                                       valid_sentences_file=KO_WIKIPEDIA_ORG_VALID_SENTENCES_FILE,
                                       test_sentences_file=KO_WIKIPEDIA_ORG_TEST_SENTENCES_FILE,
                                       mongo_query={})  # mongodb -> text file(corpus)
            log.info('create sentences file OK')
Example #22
        total_spaces = labels1.count(1)  # number of spaces in the ground truth
        correct = total_spaces - incorrect  # spaces predicted at the same positions as in the ground truth

        if total_spaces == 0:
            sim = 1
        else:
            sim = correct / total_spaces
        return sim, correct, total_spaces


if __name__ == '__main__':
    train_sentences_file = KO_WIKIPEDIA_ORG_TRAIN_SENTENCES_FILE
    valid_sentences_file = KO_WIKIPEDIA_ORG_VALID_SENTENCES_FILE
    test_sentences_file = KO_WIKIPEDIA_ORG_TEST_SENTENCES_FILE
    log.info('train_sentences_file: %s' % train_sentences_file)
    log.info('valid_sentences_file: %s' % valid_sentences_file)
    log.info('test_sentences_file: %s' % test_sentences_file)

    characters_file = KO_WIKIPEDIA_ORG_CHARACTERS_FILE
    log.info('characters_file: %s' % characters_file)
    try:
        if len(sys.argv) == 4:
            n_train = int(sys.argv[1])
            left_gram = int(sys.argv[2])
            right_gram = int(sys.argv[3])
        else:
            n_train, left_gram, right_gram = 1000000, 3, 3
            # n_train, left_gram, right_gram = int('1,000,000'.replace(',', '')), 2, 2

        if left_gram is None:
Example #23
def create_graph(scope_name, input_len=2, output_len=1, verbose=False):
    """
    create or reuse graph
    :param input_len: x1, x2
    :param output_len: y
    :param scope_name:
    :param verbose: print graph nodes
    :return: tensorflow graph nodes
    """
    with tf.variable_scope('common') as variable_scope:  # for reusing graph
        learning_rate = tf.placeholder(dtype=tf.float32, name='learning_rate')

        x = tf.placeholder(dtype=tf.float32, shape=[None, input_len], name='x')
        y = tf.placeholder(dtype=tf.float32,
                           shape=[None, output_len],
                           name='y')

        W1 = tf.get_variable(dtype=tf.float32,
                             shape=[input_len, output_len],
                             initializer=tf.random_normal_initializer(),
                             name='W1')
        b1 = tf.get_variable(dtype=tf.float32,
                             initializer=tf.constant(0.0, shape=[output_len]),
                             name='b1')

        y_hat = tf.add(tf.matmul(x, W1), b1, name='y_hat')
        cost = tf.reduce_mean(tf.square(y_hat - y), name='cost')
        train_step = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                            name='optimizer').minimize(
                                                cost, name='train_step')

    with tf.variable_scope(scope_name, reuse=None) as scope:
        _W1 = tf.summary.histogram(values=W1, name='_W1')
        _b1 = tf.summary.histogram(values=b1, name='_b1')
        _cost = tf.summary.scalar(tensor=cost, name='_cost')
        summary = tf.summary.merge([_W1, _b1, _cost])

    if verbose:
        log.info('')
        log.info(x)
        log.info(W1)
        log.info(b1)
        log.info('')
        log.info(y)
        log.info(y_hat)
        log.info(cost)
    return x, y, learning_rate, W1, b1, y_hat, cost, train_step, summary
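A hypothetical usage sketch for create_graph() above (TensorFlow 1.x): the single linear layer can fit y = x1 + x2 exactly, which makes a quick smoke test.

    import numpy as np
    import tensorflow as tf

    x, y, learning_rate, W1, b1, y_hat, cost, train_step, summary = create_graph('demo')
    x_data = np.random.uniform(-1, 1, (256, 2)).astype(np.float32)
    y_data = x_data.sum(axis=1, keepdims=True)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for _ in range(300):
            _, c = sess.run([train_step, cost],
                            feed_dict={x: x_data, y: y_data, learning_rate: 0.1})
        print('final cost: %.6f' % c)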
Example #24
        exit()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    filenames = [data_file]
    features_batch, labels_batch = input_pipeline(filenames,
                                                  batch_size=batch_size,
                                                  shuffle=shuffle,
                                                  tokens=2)
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())

        coordinator = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coordinator)
        log.info('coordinator: %s' % coordinator)
        log.info('threads: %s, %s' % (len(threads), threads))
        try:
            for nth_batch in range(5):
                if coordinator.should_stop():
                    break

                _features_batch, _labels_batch = sess.run(
                    [features_batch, labels_batch])
                log.info('')
                log.info('nth_batch: %s' % nth_batch)
                for _f, _l in zip(_features_batch, _labels_batch):
                    log.info('%s %s' % (_f.decode('utf8'),
                                        _l.decode('utf8')))  # decode for print
        except Exception:
            log.info(traceback.format_exc())
Example #25
    def dump_corpus(mongo_url, db_name, collection_name, sentences_file, characters_file, info_file, urls_file,
                    train_sentences_file, valid_sentences_file, test_sentences_file,
                    mongo_query=None, limit=None):
        """
        Read documents from MongoDB and write them out sentence by sentence. (Sentences that contain only one word or no Hangul at all are skipped.)
        :param characters_file:
        :param urls_file:
        :param info_file:
        :param mongo_url: mongodb://~~~
        :param db_name: database name of mongodb
        :param collection_name: collection name of mongodb
        :param sentences_file: *.sentence file
        :param train_sentences_file:
        :param valid_sentences_file:
        :param test_sentences_file:
        :param mongo_query: default={}
        :param limit:
        :return:
        """
        if mongo_query is None:
            mongo_query = {}

        corpus_mongo = MongodbUtil(mongo_url, db_name=db_name, collection_name=collection_name)
        total_docs = corpus_mongo.count()
        log.info('%s total: %s' % (corpus_mongo, NumUtil.comma_str(total_docs)))

        output_dir = os.path.dirname(sentences_file)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        with gzip.open(sentences_file, 'wt') as out_f, \
                gzip.open(train_sentences_file, 'wt') as train_f, \
                gzip.open(valid_sentences_file, 'wt') as valid_f, \
                gzip.open(test_sentences_file, 'wt') as test_f, \
                open(info_file, 'wt') as info_f, \
                open(urls_file, 'wt') as urls_f:

            char_set = set()
            n_docs = n_total = n_train = n_valid = n_test = 0
            if limit:
                cursor = corpus_mongo.find(mongo_query, limit=limit)
            else:
                cursor = corpus_mongo.find(mongo_query)

            for i, row in enumerate(cursor, 1):
                if i % 1000 == 0:
                    log.info('%s %.1f%% written.' % (os.path.basename(sentences_file), i / total_docs * 100))

                sentences = []
                for c in row['content']:
                    sentences.extend(HangulUtil.text2sentences(c['sentences'], remove_only_one_word=True, has_hangul=True))
                # sentences = HangulUtil.text2sentences(row['content'], remove_only_one_word=True, has_hangul=True)

                log.debug('url: %s, len: %s' % (row['url'], len(sentences)))
                if len(sentences) == 0:
                    # log.error(row['content'])
                    continue

                urls_f.write(row['url'])
                urls_f.write('\n')
                n_docs += 1

                for s in sentences:
                    _char_set = set([c for c in s])
                    char_set.update(_char_set)

                    n_total += 1
                    out_f.write(s)
                    out_f.write('\n')

                if len(sentences) >= 10:  # can split
                    test_len = valid_len = len(sentences) // 10
                    # log.info('train: %s, test: %s, valid: %s' % (len(sentences) - test_len - valid_len, test_len, valid_len))
                    for s in sentences[:test_len]:
                        n_test += 1
                        test_f.write(s)
                        test_f.write('\n')
                    for s in sentences[test_len:test_len + valid_len]:
                        n_valid += 1
                        valid_f.write(s)
                        valid_f.write('\n')
                    for s in sentences[test_len + valid_len:]:
                        n_train += 1
                        train_f.write(s)
                        train_f.write('\n')
                else:  # can't split
                    for s in sentences:
                        n_train += 1
                        train_f.write(s)
                        train_f.write('\n')

            char_list = list(char_set)
            char_list.sort()
            log.info('writing to %s...' % characters_file)
            with open(characters_file, 'w') as f:
                for c in char_list:
                    f.write(c)
                    f.write('\n')
            log.info('written to %s OK.' % characters_file)

            log.info('total docs: %s', NumUtil.comma_str(total_docs))
            log.info('total docs: %s (has hangul sentence)', NumUtil.comma_str(n_docs))
            log.info('total sentences: %s (has hangul sentence)', NumUtil.comma_str(n_total))
            log.info('train: %s', NumUtil.comma_str(n_train))
            log.info('valid: %s', NumUtil.comma_str(n_valid))
            log.info('test: %s', NumUtil.comma_str(n_test))
            log.info('total characters: %s', NumUtil.comma_str(len(char_list)))

            info_f.write('total docs: %s\n' % NumUtil.comma_str(total_docs))
            info_f.write('total docs: %s (has hangul sentence)\n' % NumUtil.comma_str(n_docs))
            info_f.write('total sentences: %s (has hangul sentence)\n' % NumUtil.comma_str(n_total))
            info_f.write('train: %s\n' % NumUtil.comma_str(n_train))
            info_f.write('valid: %s\n' % NumUtil.comma_str(n_valid))
            info_f.write('test: %s\n' % NumUtil.comma_str(n_test))
            info_f.write('total characters: %s\n' % NumUtil.comma_str(len(char_list)))
Example #26
            if l == 1 and labels2[idx] != 1:
                incorrect += 1

        total_spaces = labels1.count(1)  # number of spaces in the ground truth
        correct = total_spaces - incorrect  # spaces predicted at the same positions as in the ground truth

        if total_spaces == 0:
            sim = 1
        else:
            sim = correct / total_spaces
        return sim, correct, total_spaces


if __name__ == '__main__':
    sentences_file = KO_WIKIPEDIA_ORG_SENTENCES_FILE
    log.info('sentences_file: %s' % sentences_file)

    characters_file = KO_WIKIPEDIA_ORG_CHARACTERS_FILE
    log.info('characters_file: %s' % characters_file)
    try:
        if len(sys.argv) == 4:
            max_sentences = int(sys.argv[1])
            left_gram = int(sys.argv[2])
            right_gram = int(sys.argv[3])
        else:
            max_sentences, left_gram, right_gram = None, None, None

        if max_sentences is None:
            max_sentences = int('1,000,000'.replace(',', ''))
            # max_sentences = int('1,000,000'.replace(',', '')) if is_my_pc() else int('1,000,000'.replace(',', ''))  # run 100 or 1M data (training takes about 17 hours)
            # max_sentences = int('1,000,000'.replace(',', '')) if is_my_pc() else FileUtil.count_lines(sentences_file, gzip_format=True)  # run 100 or full data (training takes about 5 days)
Example #27
    def learning(cls, total_epoch, n_train, n_valid, n_test, batch_size, left_gram, right_gram, model_file, features_vector, labels_vector, n_hidden1=100,
                 learning_rate=0.01, early_stop_cost=0.001):
        ngram = left_gram + right_gram
        n_features = len(features_vector) * ngram  # number of features = 17,380 * 4
        n_classes = len(labels_vector) if len(labels_vector) >= 3 else 1  # number of classes = 2 but len=1

        log.info('load characters list...')
        log.info('load characters list OK. len: %s\n' % NumUtil.comma_str(len(features_vector)))
        watch = WatchUtil()

        train_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'word_spacing',
                                  'ko.wikipedia.org.dataset.sentences=%s.left=%d.right=%d.train.gz' % (n_train, left_gram, right_gram))
        valid_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'word_spacing',
                                  'ko.wikipedia.org.dataset.sentences=%s.left=%d.right=%d.valid.gz' % (n_valid, left_gram, right_gram))
        test_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'word_spacing',
                                 'ko.wikipedia.org.dataset.sentences=%s.left=%d.right=%d.test.gz' % (n_test, left_gram, right_gram))

        log.info('train_file: %s' % train_file)
        log.info('valid_file: %s' % valid_file)
        log.info('test_file: %s' % test_file)
        if not os.path.exists(train_file) or not os.path.exists(valid_file) or not os.path.exists(test_file):
            dataset_dir = os.path.dirname(train_file)
            if not os.path.exists(dataset_dir):
                os.makedirs(dataset_dir)

            watch.start('create dataset')
            log.info('create dataset...')

            data_files = (('train', KO_WIKIPEDIA_ORG_TRAIN_SENTENCES_FILE, n_train, train_file, False),
                          ('valid', KO_WIKIPEDIA_ORG_VALID_SENTENCES_FILE, n_valid, valid_file, False),
                          ('test', KO_WIKIPEDIA_ORG_TEST_SENTENCES_FILE, n_test, test_file, False))

            for name, data_file, total, dataset_file, to_one_hot_vector in data_files:
                check_interval = 10000
                log.info('check_interval: %s' % check_interval)
                log.info('%s %s total: %s' % (name, os.path.basename(data_file), NumUtil.comma_str(total)))

                features, labels = [], []
                with gzip.open(data_file, 'rt') as f:
                    for i, line in enumerate(f, 1):
                        if total < i:
                            break

                        if i % check_interval == 0:
                            time.sleep(0.01)  # prevent cpu overload
                            percent = i / total * 100
                            log.info('create dataset... %.1f%% read. data len: %s. %s' % (percent, NumUtil.comma_str(len(features)), data_file))

                        _f, _l = WordSpacing.sentence2features_labels(line.strip(), left_gram=left_gram, right_gram=right_gram)
                        features.extend(_f)
                        labels.extend(_l)

                dataset = DataSet(features=features, labels=labels, features_vector=features_vector, labels_vector=labels_vector, name=name)
                log.info('dataset save... %s' % dataset_file)
                dataset.save(dataset_file, gzip_format=True, verbose=True)
                log.info('dataset save OK. %s' % dataset_file)
                log.info('dataset: %s' % dataset)

            log.info('create dataset OK.')
            log.info('')
            watch.stop('create dataset')

        watch.start('dataset load')
        log.info('dataset load...')
        train = DataSet.load(train_file, gzip_format=True, verbose=True)

        if n_train >= int('100,000'.replace(',', '')):
            valid = DataSet.load(valid_file, gzip_format=True, verbose=True)
        else:
            valid = DataSet.load(train_file, gzip_format=True, verbose=True)
        log.info('valid.convert_to_one_hot_vector()...')
        valid = valid.convert_to_one_hot_vector(verbose=True)
        log.info('valid.convert_to_one_hot_vector() OK.')

        log.info('train dataset: %s' % train)
        log.info('valid dataset: %s' % valid)
        log.info('dataset load OK.')
        log.info('')
        watch.stop('dataset load')

        graph = WordSpacing.build_FFNN(n_features, n_classes, n_hidden1, learning_rate, watch)

        train_step, X, Y, cost, predicted, accuracy = graph['train_step'], graph['X'], graph['Y'], graph['cost'], graph['predicted'], graph['accuracy']

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            check_interval = 10  # max(1, min(1000, n_train // 10))
            nth_train, nth_input, total_input = 0, 0, total_epoch * train.size

            log.info('learn...')
            log.info('total: %s' % NumUtil.comma_str(train.size))
            watch.start('learn')
            valid_cost = sys.float_info.max
            for epoch in range(1, total_epoch + 1):
                if valid_cost < early_stop_cost:
                    break
                for step, (features_batch, labels_batch) in enumerate(train.next_batch(batch_size=batch_size), 1):
                    if valid_cost < early_stop_cost:
                        log.info('valid_cost: %s, early_stop_cost: %s, early stopped.' % (valid_cost, early_stop_cost))
                        break
                    nth_train += 1
                    nth_input += features_batch.shape[0]
                    sess.run(train_step, feed_dict={X: features_batch, Y: labels_batch})

                    # if step % check_interval == 1:
                    percent = nth_input / total_input * 100
                    valid_cost = sess.run(cost, feed_dict={X: valid.features, Y: valid.labels})
                    log.info('[epoch=%s][%.1f%%] %s cost: %.4f' % (epoch, percent, valid.name, valid_cost))
            watch.stop('learn')
            log.info('learn OK.\n')

            log.info('model save... %s' % model_file)
            watch.start('model save...')
            model_dir = os.path.dirname(model_file)
            if not os.path.exists(model_dir):
                os.makedirs(model_dir)
            saver = tf.train.Saver()
            saver.save(sess, model_file)
            watch.stop('model save...')
            log.info('model save OK. %s' % model_file)

        log.info('\n')
        log.info('batch_size: %s' % batch_size)
        log.info(watch.summary())
        log.info('\n')
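In learning() above, each training example is the concatenated one-hot encoding of the left_gram + right_gram characters around a candidate space position (hence n_features = len(features_vector) * ngram), and the label says whether a space belongs there. sentence2features_labels itself is not shown in these excerpts; the sketch below is only a plausible reconstruction of that idea under a hypothetical name, not the repository's implementation:

def sentence_to_pairs(sentence, left_gram=2, right_gram=2):
    """Hypothetical sketch: yield (character context, label) pairs; label 1 means a space follows that position."""
    sentence = sentence.strip()
    text = sentence.replace(' ', '')
    labels = []  # labels[i] == 1 iff a space follows text[i] in the original sentence
    for ch in sentence:
        if ch == ' ':
            labels[-1] = 1
        else:
            labels.append(0)
    padded = '_' * (left_gram - 1) + text + '_' * right_gram  # '_' as out-of-sentence padding
    pairs = []
    for i in range(len(text)):
        context = padded[i:i + left_gram + right_gram]  # left_gram chars ending at i, right_gram chars after
        pairs.append((context, labels[i]))
    return pairs

print(sentence_to_pairs('아버지가 방에'))
# [('_아버지', 0), ('아버지가', 0), ('버지가방', 0), ('지가방에', 1), ('가방에_', 0), ('방에__', 0)]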
Example #28
    output_len = 1  # y
    _learning_rate = 0.01

    n_train, n_valid, n_test = 1000, 100, 10
    if not os.path.exists(train_file):
        create_data4add(train_file, n_train, digit_max=99)
    if not os.path.exists(valid_file):
        create_data4add(valid_file, n_valid, digit_max=99)
    if not os.path.exists(test_file):
        create_data4add(test_file, n_test, digit_max=99)

    for training_mode in [True, False]:  # training & testing
        for batch_size in [1, 10, 100]:
            tf.reset_default_graph()  # clears the default graph stack and resets the global default graph
            log.info('')
            log.info(
                'training_mode: %s, batch_size: %s, total_train_time: %s secs'
                % (training_mode, batch_size, total_train_time))

            model_name = os.path.basename(__file__).replace('.py', '')
            model_file = os.path.join(
                MODELS_DIR,
                '%s.n_train_%s.batch_size_%s.total_train_time_%s/model' %
                (model_name, n_train, batch_size, total_train_time))
            model_dir = os.path.dirname(model_file)
            log.info('model_name: %s' % model_name)
            log.info('model_file: %s' % model_file)

            scope_name = '%s.%s.batch_size_%s.total_train_time_%s' % (model_name, DateUtil.current_yyyymmdd_hhmm(), batch_size, total_train_time)
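create_data4add is not included in these excerpts. Given the tab-delimited, three-column input_pipeline call in Example #30 below (delim='\t', splits=3) and the x1, x2 -> y addition setup, it presumably writes one 'x1<TAB>x2<TAB>x1+x2' line per example; the sketch below is an assumption along those lines, not the original implementation:

import os
import random

def create_data4add(data_file, n_data, digit_max=99):
    # Hypothetical sketch: random addition pairs, one tab-separated 'x1 x2 x1+x2' line per example
    data_dir = os.path.dirname(data_file)
    if data_dir and not os.path.exists(data_dir):
        os.makedirs(data_dir)
    with open(data_file, 'wt') as f:
        for _ in range(n_data):
            x1, x2 = random.randint(0, digit_max), random.randint(0, digit_max)
            f.write('%d\t%d\t%d\n' % (x1, x2, x1 + x2))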
Example #29
    def build_FFNN(cls, n_features, n_classes, n_hidden1, learning_rate, watch=WatchUtil()):  # TODO: 2 layers
        log.info('\nbuild_FFNN')
        if len(cls.graph_nodes) == 0:
            n_hidden3 = n_hidden2 = n_hidden1
            log.info('create tensorflow graph...')
            watch.start('create tensorflow graph')
            log.info('n_features: %s' % n_features)
            log.info('n_classes: %s' % n_classes)
            log.info('n_hidden1: %s' % n_hidden1)
            log.info('n_hidden2: %s' % n_hidden2)
            log.info('n_hidden3: %s' % n_hidden3)

            tf.set_random_seed(777)  # for reproducibility

            X = tf.placeholder(tf.float32, [None, n_features], name='X')  # concatenated one-hot vectors of the left_gram + right_gram context characters
            Y = tf.placeholder(tf.float32, [None, n_classes], name='Y')

            # W1 = tf.Variable(tf.truncated_normal([n_features, n_hidden1], mean=0.0, stddev=0.1), name='W1')
            # b1 = tf.Variable(tf.constant(0.1, shape=[n_hidden1]), name='b1')
            W1 = tf.Variable(tf.random_normal([n_features, n_hidden1]), name='W1')
            b1 = tf.Variable(tf.random_normal([n_hidden1]), name='b1')
            layer1 = tf.nn.relu(tf.matmul(X, W1) + b1, name='layer1')

            W2 = tf.Variable(tf.random_normal([n_hidden1, n_hidden2]), name='W2')
            b2 = tf.Variable(tf.random_normal([n_hidden2]), name='b2')
            layer2 = tf.nn.relu(tf.matmul(layer1, W2) + b2, name='layer2')

            W3 = tf.Variable(tf.random_normal([n_hidden2, n_hidden3]), name='W3')
            b3 = tf.Variable(tf.random_normal([n_hidden3]), name='b3')
            layer3 = tf.nn.relu(tf.matmul(layer2, W3) + b3, name='layer3')

            W4 = tf.Variable(tf.random_normal([n_hidden3, n_classes]), name='W4')
            b4 = tf.Variable(tf.random_normal([n_classes]), name='b4')
            y_hat = tf.add(tf.matmul(layer3, W4), b4, name='y_hat')

            # cost = -tf.reduce_mean(Y * tf.log(hypothesis) + (1 - Y) * tf.log(1 - hypothesis), name='cost')  # cost/loss function
            cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=y_hat, labels=Y), name='cost')

            train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)  # works very well: sentences=10000 + 4 layers, ~10 min, accuracy 0.9294, cost: 0.1839

            predicted = tf.cast(tf.sigmoid(y_hat) > 0.5, dtype=tf.float32, name='predicted')  # y_hat is a logit, so threshold the sigmoid output (0 <= sigmoid(y_hat) <= 1)

            accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted, Y), dtype=tf.float32), name='accuracy')

            watch.stop('create tensorflow graph')
            log.info('create tensorflow graph OK.\n')
            cls.graph_nodes = {'predicted': predicted, 'accuracy': accuracy, 'X': X, 'Y': Y, 'train_step': train_step, 'cost': cost}
        return cls.graph_nodes
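build_FFNN caches its nodes in cls.graph_nodes, so repeated calls reuse the same graph. A rough sketch of how the returned dict and the checkpoint written by learning() in Example #27 could be used for evaluation (here `test` is assumed to be a DataSet already converted to one-hot vectors, and model_file, n_features, n_classes are assumed to come from the surrounding code):

import tensorflow as tf

graph = WordSpacing.build_FFNN(n_features, n_classes, n_hidden1=100, learning_rate=0.01)
X, Y, accuracy = graph['X'], graph['Y'], graph['accuracy']

with tf.Session() as sess:
    saver = tf.train.Saver()
    saver.restore(sess, model_file)  # weights saved by learning() via saver.save(sess, model_file)
    test_accuracy = sess.run(accuracy, feed_dict={X: test.features, Y: test.labels})
    log.info('test accuracy: %.4f' % test_accuracy)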
Example #30
def create_graph(scope_name, mode, input_file, input_len=2, output_len=1, batch_size=1, verbose=False, reuse=None, n_threads=2):
    """
    create or reuse graph
    :param scope_name: variable scope name
    :param mode: 'train', 'valid', 'test'
    :param input_file: train or valid or test file path
    :param input_len: x1, x2
    :param output_len: y
    :param batch_size: batch size > 0
    :param verbose: print graph nodes
    :param reuse: reuse graph or not
    :param n_threads: number of example enqueue threads (2 is enough)
    :return: tensorflow graph nodes
    """
    with tf.variable_scope('common', reuse=reuse):  # for reusing graph
        W1 = tf.get_variable(dtype=tf.float32, shape=[input_len, output_len], initializer=tf.random_normal_initializer(), name='W1')
        b1 = tf.get_variable(dtype=tf.float32, initializer=tf.constant(0.0, shape=[output_len]), name='b1')
        learning_rate = tf.placeholder(dtype=tf.float32, name='learning_rate')

    with tf.variable_scope(mode, reuse=None):
        x, y = input_pipeline([input_file], batch_size=batch_size, delim='\t', splits=3, n_threads=n_threads)
        y_hat = tf.add(tf.matmul(x, W1), b1, name='y_hat')
        cost = tf.reduce_mean(tf.square(y_hat - y), name='cost')
        train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost, name='train_step')

    with tf.variable_scope(scope_name, reuse=None):
        _W1 = tf.summary.histogram(values=W1, name='_W1')
        _b1 = tf.summary.histogram(values=b1, name='_b1')
        _cost = tf.summary.scalar(tensor=cost, name='_cost')
        summary = tf.summary.merge([_W1, _b1, _cost], name='summary')  # merge_all()
        if verbose:
            log.info('')
            log.info(x)
            log.info(W1)
            log.info(b1)
            log.info('')
            log.info(y)
            log.info(y_hat)
            log.info(cost)
    return x, y, learning_rate, W1, b1, y_hat, cost, train_step, summary
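create_graph keeps W1 and b1 in the shared 'common' variable scope, so train/valid/test graphs built over different input files reuse the same weights. A minimal usage sketch under that reading (the file paths, step count, and learning rate are placeholders, and the Coordinator/queue-runner boilerplate assumes input_pipeline is built on TF1 input queues):

import tensorflow as tf

tf.reset_default_graph()
train_nodes = create_graph('train_scope', 'train', train_file, batch_size=10, reuse=None)  # creates W1, b1
valid_nodes = create_graph('valid_scope', 'valid', valid_file, batch_size=1, reuse=True)   # reuses W1, b1

x, y, learning_rate, W1, b1, y_hat, cost, train_step, summary = train_nodes
valid_cost = valid_nodes[6]  # cost node of the valid graph

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)  # feed the input_pipeline queues
    try:
        for step in range(1, 101):
            _, train_cost = sess.run([train_step, cost], feed_dict={learning_rate: 0.01})
            if step % 20 == 0:
                print('step %d train cost: %.4f, valid cost: %.4f' % (step, train_cost, sess.run(valid_cost)))
    finally:
        coord.request_stop()
        coord.join(threads)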