def preprocess(params):
    # Data processing: encode text inputs into numeric vectors
    processor = preprocessing.VocabularyProcessor(
        max_document_length=params.max_sequence_length,
        min_frequency=params.min_frequency)
    encoded_inputs = list(processor.fit_transform(inputs))
    vocab_size = len(processor.vocabulary_)

    encoded_inputs = np.array(encoded_inputs)
    encoded_labels = np.array([int(label == 'ham') for label in labels])

    # Shuffle and split data
    np.random.seed(0)
    shuffled_ix = np.random.permutation(np.arange(len(encoded_labels)))
    x_shuffled = encoded_inputs[shuffled_ix]
    y_shuffled = encoded_labels[shuffled_ix]

    # Split train/test set
    ix_cutoff = int(len(y_shuffled) * 0.80)
    x_train, x_test = x_shuffled[:ix_cutoff], x_shuffled[ix_cutoff:]
    y_train, y_test = y_shuffled[:ix_cutoff], y_shuffled[ix_cutoff:]

    if hype.util.is_debug_logged():
        hype.util.debug('Vocabulary size: %d' % vocab_size)
        hype.util.debug('Train/test split: train=%d, test=%d' %
                        (len(y_train), len(y_test)))

    train = hype.DataSet(x_train, y_train)
    test = hype.DataSet(x_test, y_test)
    data = hype.Data(train, test, test)

    return data, vocab_size
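The function above reads module-level `inputs` and `labels` prepared elsewhere in that script. A minimal sketch of how they might be loaded, assuming a tab-separated SMS spam corpus with 'ham'/'spam' labels (the file name is hypothetical):

inputs, labels = [], []
with open('SMSSpamCollection.txt', encoding='utf-8') as f:  # hypothetical path
    for line in f:
        label, text = line.strip().split('\t', 1)
        labels.append(label)
        inputs.append(text)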
Example No. 2
def create_vocab(input_iter, min_freq):
    print('begin ---')
    vp = preprocessing.VocabularyProcessor(
        max_document_length=MAX_SENTENCE_LEN * MAX_NUM_UTTER,
        min_frequency=min_freq,
        tokenizer_fn=tokenizer_fn)
    print('begin fit -----')
    vp.fit(input_iter)
    return vp
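`create_vocab` relies on `MAX_SENTENCE_LEN`, `MAX_NUM_UTTER`, and `tokenizer_fn` defined elsewhere in that project. A minimal sketch of plausible stand-ins (the constant values and the whitespace tokenizer are assumptions):

MAX_SENTENCE_LEN = 50  # assumed value
MAX_NUM_UTTER = 10     # assumed value

def tokenizer_fn(iterator):
    # VocabularyProcessor expects a generator that yields a list of tokens per document.
    for document in iterator:
        yield document.split()

vp = create_vocab(['hello there general kenobi', 'hello world'], min_freq=0)
print('vocabulary size:', len(vp.vocabulary_))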
Example No. 3
File: 2.py Project: Lee077/AI
def test():
    text_list = ['苹果 是 什么 垃圾', '塑料瓶 是 那种 垃圾']  # text pre-segmented with jieba
    max_words_length = 10
    vocab_processor = preprocessing.VocabularyProcessor(
        max_document_length=max_words_length)
    x = np.array(list(vocab_processor.fit_transform(text_list)))

    print('x:\n', x)

    print('Word-to-index mapping:\n', vocab_processor.vocabulary_._mapping)

    print('Vocabulary:\n', vocab_processor.vocabulary_._reverse_mapping)

    # Save the vocabulary
    vocab_processor.save('vocab.pkl')
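A short follow-up sketch restoring the vocabulary saved above and encoding new text with it (`VocabularyProcessor.restore` is the counterpart of `save`):

restored = preprocessing.VocabularyProcessor.restore('vocab.pkl')
new_x = np.array(list(restored.transform(['塑料瓶 是 什么 垃圾'])))
print('Encoded with the restored vocabulary:\n', new_x)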
Example No. 4
def preprocess(params):
    # Data processing: encode text inputs into numeric vectors
    processor = preprocessing.VocabularyProcessor(
        max_document_length=params.max_sequence_length,
        min_frequency=params.min_frequency)
    encoded_inputs = list(processor.fit_transform(inputs))
    vocab_size = len(processor.vocabulary_)

    # Set this to see verbose output:
    #   hype.util.set_verbose()

    if hype.util.is_debug_logged():
        hype.util.debug('Encoded text examples:')
        for i in range(3):
            hype.util.debug('  %s ->' % inputs[i])
            hype.util.debug('  %s\n' % encoded_inputs[i].tolist())

    encoded_inputs = np.array(encoded_inputs)
    encoded_labels = np.array([int(label == 'ham') for label in labels])

    # Shuffle the data
    shuffled_ix = np.random.permutation(np.arange(len(encoded_labels)))
    x_shuffled = encoded_inputs[shuffled_ix]
    y_shuffled = encoded_labels[shuffled_ix]

    # Split into train/validation/test sets
    idx1 = int(len(y_shuffled) * 0.75)
    idx2 = int(len(y_shuffled) * 0.85)
    x_train, x_val, x_test = (x_shuffled[:idx1], x_shuffled[idx1:idx2],
                              x_shuffled[idx2:])
    y_train, y_val, y_test = (y_shuffled[:idx1], y_shuffled[idx1:idx2],
                              y_shuffled[idx2:])

    if hype.util.is_debug_logged():
        hype.util.debug('Vocabulary size: %d' % vocab_size)
        hype.util.debug(
            'Train/validation/test split: train=%d, val=%d, test=%d' %
            (len(y_train), len(y_val), len(y_test)))

    train = hype.DataSet(x_train, y_train)
    validation = hype.DataSet(x_val, y_val)
    test = hype.DataSet(x_test, y_test)
    data = hype.Data(train, validation, test)

    return data, vocab_size
Example No. 5
def building_model(documents,
                   save_path,
                   max_document_length=512,
                   vocabulary=None,
                   split_fn=default_split_fn):
    """
    Build vocabulary information from the given documents.
    :param documents: text data used to fit the vocabulary model
    :param save_path: path where the fitted model is persisted
    :param vocabulary: optional vocabulary mapping to reuse
    :param split_fn: function used to split text into words; the default treats
        every character as a word
    :param max_document_length: maximum number of words allowed per document
        when converting text into word ids
    :return:
    """
    tf.logging.info("Start building the vocabulary model...")
    model = preprocessing.VocabularyProcessor(
        max_document_length=max_document_length,
        vocabulary=vocabulary,
        tokenizer_fn=split_fn)
    model.fit(raw_documents=documents)
    tf.logging.info("Vocabulary model built; saving it now!")
    model.save(save_path)
    tf.logging.info("Vocabulary model saved to: {}".format(save_path))
Example No. 6
    def _read_raw_data(self):

        XData = []
        xLengths = []
        YData = []
        names = []

        self.print('======= Reading pre-made vector files... =======')
        self.print('Data source: ' + self.inputSource)

        numSkipped = 0

        with open(self.inputSource, encoding='utf8') as ifile:

            for d in json.load(ifile):
                occ = d['occupation']
                content = d['content']
                numTokens = len(content.split(' '))

                if numTokens < self.minimumWords:
                    numSkipped += 1
                    continue

                XData.append(content)
                xLengths.append(numTokens)
                YData.append(occ if type(occ) == str else occ[-1])
                names.append(d['name'])

        self.print('%d out of %d skipped' % (numSkipped, numSkipped + len(XData)))
        self.maxXLen = max(xLengths)

        self.vocabProcessor = preprocessing.VocabularyProcessor(self.maxXLen)
        XData = list(self.vocabProcessor.fit_transform(XData))
        self.vocabSize = len(self.vocabProcessor.vocabulary_)

        return np.array(XData), np.array(YData), np.array(xLengths), np.array(names)
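`_read_raw_data` expects `self.inputSource` to point at a JSON array of records with 'name', 'occupation', and 'content' keys, where 'occupation' may be a string or a list. A minimal sketch that writes such a file with made-up records:

import json

sample = [
    {"name": "Alice", "occupation": "engineer",
     "content": "builds data pipelines and trains text classifiers"},
    {"name": "Bob", "occupation": ["student", "writer"],
     "content": "writes short essays about machine learning and careers"},
]
with open('people.json', 'w', encoding='utf8') as ofile:  # hypothetical path
    json.dump(sample, ofile, ensure_ascii=False)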
Example No. 7
    def __init__(self, pos_data, neg_data, word_vec, split_ratio=0.8):
        # load train and test data
        print("load data...")
        self.data_x, self.data_y = self.load_data(pos_data, neg_data)

        # Build vocabulary
        self.max_document_length = max(
            [len(x.split(" ")) for x in self.data_x])
        self.vocab_processor = preprocessing.VocabularyProcessor(
            self.max_document_length)
        self.data_x = np.array(
            list(self.vocab_processor.fit_transform(self.data_x)))

        # Randomly shuffle data
        np.random.seed(10)
        shuffle_indices = np.random.permutation(np.arange(len(self.data_y)))
        x_shuffled = self.data_x[shuffle_indices]
        y_shuffled = self.data_y[shuffle_indices]

        dev_sample_index = int(len(y_shuffled) * split_ratio)
        self.train_x, self.test_x = (x_shuffled[:dev_sample_index],
                                     x_shuffled[dev_sample_index:])
        self.train_y, self.test_y = (y_shuffled[:dev_sample_index],
                                     y_shuffled[dev_sample_index:])
Example No. 8
    def fit(self, X, y, sample_weights=None):
        self.__class_labels.clear()
        self.__define_flags()

        self.classes_ = sorted(list(set(y)))
        x_text, y = self.__transform_data(X, y, sample_weights)
        max_doc_length = max([len(x.split(" ")) for x in x_text])
        self.__vocab_proc = preprocessing.VocabularyProcessor(max_doc_length)
        x = np.array(list(self.__vocab_proc.fit_transform(x_text)))

        np.random.seed(self.__random_state)

        shuffle_indices = np.random.permutation(np.arange(len(y)))
        x_shuffled = x[shuffle_indices]
        y_shuffled = y[shuffle_indices]
        dev_sample_index = -1 * int(self.__flags.dev_sample_percentage *
                                    float(len(y)))
        x_train = x_shuffled[:dev_sample_index]
        x_dev = x_shuffled[dev_sample_index:]
        y_train = y_shuffled[:dev_sample_index]
        y_dev = y_shuffled[dev_sample_index:]

        with tf.Graph().as_default():
            asp = self.__flags.allow_soft_placement
            ldp = self.__flags.log_device_placement
            session_conf = tf.ConfigProto(allow_soft_placement=asp,
                                          log_device_placement=ldp)
            sess = tf.Session(config=session_conf)

            with sess.as_default():
                vocab_size = len(self.__vocab_proc.vocabulary_)
                embedding_size = self.__flags.embedding_dim
                filter_sizes = self.__flags.filter_sizes.split(",")
                filter_sizes = list(map(int, filter_sizes))
                cnn = self.__TextCNN(sequence_length=x_train.shape[1],
                                     num_classes=y_train.shape[1],
                                     vocab_size=vocab_size,
                                     embedding_size=embedding_size,
                                     filter_sizes=filter_sizes,
                                     num_filters=self.__flags.num_filters,
                                     l2_reg_lambda=self.__flags.l2_reg_lambda,
                                     word2vec=self.__w2v)
                global_step = tf.Variable(0, name="global_step",
                                          trainable=False)
                optimizer = tf.train.AdamOptimizer(1e-3)
                grads_and_vars = optimizer.compute_gradients(cnn.loss)
                train_op = optimizer.apply_gradients(grads_and_vars,
                                                     global_step=global_step)
                grad_summaries = []

                for g, v in grads_and_vars:
                    if g is not None:
                        name = v.name.replace(":", "_")
                        histogram = "{}/grad/hist".format(name)
                        grad_hist_summary = tf.summary.histogram(histogram, g)
                        sparsity = "{}/grad/sparsity".format(name)
                        frac = tf.nn.zero_fraction(g)
                        sparsity_summary = tf.summary.scalar(sparsity, frac)

                        grad_summaries.append(grad_hist_summary)
                        grad_summaries.append(sparsity_summary)

                grad_summaries_merged = tf.summary.merge(grad_summaries)
                timestamp = str(int(time.time()))
                out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs",
                                                       timestamp))
                loss_summary = tf.summary.scalar("loss", cnn.loss)
                acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)
                train_summary_op = tf.summary.merge([loss_summary, acc_summary,
                                                     grad_summaries_merged])
                train_summary_dir = os.path.join(out_dir, "summaries", "train")
                train_summary_writer = tf.summary.FileWriter(train_summary_dir,
                                                             sess.graph)
                dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
                dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
                dev_summary_writer = tf.summary.FileWriter(dev_summary_dir,
                                                           sess.graph)
                path = os.path.join(out_dir, "checkpoints")
                self.__checkpoint_dir = os.path.abspath(path)
                checkpoint_prefix = os.path.join(self.__checkpoint_dir,
                                                 "model")

                if not os.path.exists(self.__checkpoint_dir):
                    os.makedirs(self.__checkpoint_dir)

                max_to_keep = self.__flags.num_checkpoints
                saver = tf.train.Saver(tf.global_variables(),
                                       max_to_keep=max_to_keep)

                sess.run(tf.global_variables_initializer())

                batches = self.__batch_iter(list(zip(x_train, y_train)),
                                            self.__flags.batch_size,
                                            self.__flags.num_epochs)

                for batch in batches:
                    x_batch, y_batch = zip(*batch)

                    self.__train_step(x_batch, y_batch, cnn, sess,
                                      self.__flags.dropout_keep_prob, train_op,
                                      global_step, train_summary_op,
                                      train_summary_writer)

                    current_step = tf.train.global_step(sess, global_step)

                    if current_step % self.__flags.evaluate_every == 0:
                        self.__dev_step(x_dev, y_dev, cnn, sess, global_step,
                                        dev_summary_op,
                                        writer=dev_summary_writer)

                    if current_step % self.__flags.checkpoint_every == 0:
                        saver.save(sess, checkpoint_prefix,
                                   global_step=current_step)

        self.__complete_training()
        return self
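The `fit` method above reads its hyperparameters from `self.__flags`, which `__define_flags` (not shown) populates. A minimal sketch of the fields it uses; the values are illustrative placeholders, not the original project's settings:

from types import SimpleNamespace

# Every attribute below is read somewhere in fit(); the values are placeholders.
flags = SimpleNamespace(
    dev_sample_percentage=0.1,
    embedding_dim=128,
    filter_sizes="3,4,5",
    num_filters=128,
    l2_reg_lambda=0.0,
    allow_soft_placement=True,
    log_device_placement=False,
    dropout_keep_prob=0.5,
    batch_size=64,
    num_epochs=10,
    evaluate_every=100,
    checkpoint_every=100,
    num_checkpoints=5,
)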
Example No. 9
def create_vocab(input_iter, min_freq):
    vp = preprocessing.VocabularyProcessor(max_document_length=100,
                                           min_frequency=min_freq,
                                           tokenizer_fn=tokenizer_fn)
    vp.fit(input_iter)
    return vp
Example No. 10
# coding: utf-8
"""
词汇表模型:
* 袋模型可以很好的表现文本由哪些单词组成,但是却无法表达出单词之间的前后关系,
* 于是人们借鉴了词袋模型的思想,使用生成的词汇表对原有句子按照单词逐个进行编码。
"""

import numpy as np
from tensorflow.contrib.learn import preprocessing as pc

# Corpus
corpus = ['i love you', 'me too']

vocab = pc.VocabularyProcessor(max_document_length=4)
"""
VocabularyProcessor 参数
* max_document_length: 文档的最大长度。如果文本的长度大于最大长度,那么它会被剪切,反之则用0填充。
* min_frequency: 词频的最小值,出现次数小于最小词频则不会被收录到词表中。
* vocabulary: CategoricalVocabulary 对象。
* tokenizer_fn: 分词函数。
"""
# 创建词汇表,创建后不能更改
vocab.fit(corpus)

# transform() returns an iterator; use next() to step through it
print("Encoding: \n", next(vocab.transform(['i me too'])).tolist())

# Get the encoded matrix for the corpus
mat_corpus = np.array(list(vocab.transform(corpus)))
print("mat_corpus: \n", mat_corpus)