def preprocess(params):
    # Data processing: encode text inputs into numeric vectors.
    # Assumes module-level imports (numpy as np, hyperengine as hype,
    # tensorflow.contrib.learn's preprocessing) and module-level lists
    # `inputs` and `labels` holding raw texts and 'ham'/'spam' labels.
    processor = preprocessing.VocabularyProcessor(
        max_document_length=params.max_sequence_length,
        min_frequency=params.min_frequency)
    encoded_inputs = list(processor.fit_transform(inputs))
    vocab_size = len(processor.vocabulary_)

    encoded_inputs = np.array(encoded_inputs)
    encoded_labels = np.array([int(label == 'ham') for label in labels])

    # Shuffle and split data
    np.random.seed(0)
    shuffled_ix = np.random.permutation(np.arange(len(encoded_labels)))
    x_shuffled = encoded_inputs[shuffled_ix]
    y_shuffled = encoded_labels[shuffled_ix]

    # Split train/test set
    ix_cutoff = int(len(y_shuffled) * 0.80)
    x_train, x_test = x_shuffled[:ix_cutoff], x_shuffled[ix_cutoff:]
    y_train, y_test = y_shuffled[:ix_cutoff], y_shuffled[ix_cutoff:]

    if hype.util.is_debug_logged():
        hype.util.debug('Vocabulary size: %d' % vocab_size)
        hype.util.debug('Train/test split: train=%d, test=%d' %
                        (len(y_train), len(y_test)))

    train = hype.DataSet(x_train, y_train)
    test = hype.DataSet(x_test, y_test)
    data = hype.Data(train, test, test)
    return data, vocab_size
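# Usage sketch for preprocess() above. The `Params` namedtuple and the
# values below are illustrative assumptions; the real script would pass its
# own hyper-parameter object exposing these two attributes, with `inputs`
# and `labels` already loaded at module level.
import collections

Params = collections.namedtuple('Params', ['max_sequence_length', 'min_frequency'])
data, vocab_size = preprocess(Params(max_sequence_length=128, min_frequency=2))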
def create_vocab(input_iter, min_freq):
    print('Building vocabulary processor...')
    vp = preprocessing.VocabularyProcessor(
        max_document_length=MAX_SENTENCE_LEN * MAX_NUM_UTTER,
        min_frequency=min_freq,
        tokenizer_fn=tokenizer_fn)
    print('Fitting vocabulary...')
    vp.fit(input_iter)
    return vp
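# `tokenizer_fn` above is not defined in this excerpt. VocabularyProcessor
# expects a generator that takes an iterator of raw documents and yields a
# list of tokens for each one. A minimal whitespace tokenizer, as an
# illustrative assumption, could look like this:
def tokenizer_fn(iterator):
    for doc in iterator:
        yield doc.split(' ')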
def test():
    # Texts are pre-tokenized with jieba (space-separated words)
    text_list = ['苹果 是 什么 垃圾', '塑料瓶 是 那种 垃圾']
    max_words_length = 10
    vocab_processor = preprocessing.VocabularyProcessor(
        max_document_length=max_words_length)
    x = np.array(list(vocab_processor.fit_transform(text_list)))

    print('x:\n', x)
    print('word-to-index mapping:\n', vocab_processor.vocabulary_._mapping)
    print('vocabulary:\n', vocab_processor.vocabulary_._reverse_mapping)

    # Persist the vocabulary
    vocab_processor.save('vocab.pkl')
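# A saved vocabulary can be loaded back with the classmethod
# VocabularyProcessor.restore() and used to encode new (pre-tokenized) text
# with the same word-to-index mapping; unseen words map to id 0:
restored = preprocessing.VocabularyProcessor.restore('vocab.pkl')
print(next(restored.transform(['苹果 是 垃圾'])).tolist())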
def preprocess(params):
    # Data processing: encode text inputs into numeric vectors.
    # `inputs` and `labels` are assumed to be module-level lists, as above.
    processor = preprocessing.VocabularyProcessor(
        max_document_length=params.max_sequence_length,
        min_frequency=params.min_frequency)
    encoded_inputs = list(processor.fit_transform(inputs))
    vocab_size = len(processor.vocabulary_)

    # Set this to see verbose output:
    #   hype.util.set_verbose()
    if hype.util.is_debug_logged():
        hype.util.debug('Encoded text examples:')
        for i in range(3):
            hype.util.debug('  %s ->' % inputs[i])
            hype.util.debug('  %s\n' % encoded_inputs[i].tolist())

    encoded_inputs = np.array(encoded_inputs)
    encoded_labels = np.array([int(label == 'ham') for label in labels])

    # Shuffle the data
    shuffled_ix = np.random.permutation(np.arange(len(encoded_labels)))
    x_shuffled = encoded_inputs[shuffled_ix]
    y_shuffled = encoded_labels[shuffled_ix]

    # Split into train/validation/test sets (75% / 10% / 15%)
    idx1 = int(len(y_shuffled) * 0.75)
    idx2 = int(len(y_shuffled) * 0.85)
    x_train, x_val, x_test = x_shuffled[:idx1], x_shuffled[idx1:idx2], x_shuffled[idx2:]
    y_train, y_val, y_test = y_shuffled[:idx1], y_shuffled[idx1:idx2], y_shuffled[idx2:]

    if hype.util.is_debug_logged():
        hype.util.debug('Vocabulary size: %d' % vocab_size)
        hype.util.debug('Train/validation/test split: train=%d, val=%d, test=%d' %
                        (len(y_train), len(y_val), len(y_test)))

    train = hype.DataSet(x_train, y_train)
    validation = hype.DataSet(x_val, y_val)
    test = hype.DataSet(x_test, y_test)
    data = hype.Data(train, validation, test)
    return data, vocab_size
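# A minimal sketch of what min_frequency does in the processors above:
# tokens below the frequency threshold are trimmed from the vocabulary
# during fit() and then encode as id 0, the "unknown" bucket.
demo = preprocessing.VocabularyProcessor(max_document_length=4, min_frequency=1)
demo_ids = np.array(list(demo.fit_transform(['a a a b', 'a c c c'])))
# 'b' occurs only once, so it is trimmed and rendered as 0
print(demo_ids)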
def building_model(documents,
                   save_path,
                   max_document_length=512,
                   vocabulary=None,
                   split_fn=default_split_fn):
    """
    Build vocabulary information from the given documents.
    :param documents: the text data used to fit the model
    :param save_path: path where the model is persisted
    :param vocabulary: an existing vocabulary mapping, if any
    :param split_fn: function used to split text into words; by default each
                     character is treated as one word
    :param max_document_length: maximum number of word ids kept per document
                                when converting text to id sequences
    :return:
    """
    tf.logging.info("Start building the vocabulary conversion model...")
    model = preprocessing.VocabularyProcessor(
        max_document_length=max_document_length,
        vocabulary=vocabulary,
        tokenizer_fn=split_fn)
    model.fit(raw_documents=documents)
    tf.logging.info("Vocabulary conversion model built; starting to save it!")
    model.save(save_path)
    tf.logging.info("Vocabulary conversion model saved to: {}".format(save_path))
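# `default_split_fn` is referenced above but not shown. Per the docstring it
# treats each character as one word; a minimal sketch consistent with the
# tokenizer_fn contract (in the real file it would be defined *before*
# building_model, since default arguments bind at definition time):
def default_split_fn(iterator):
    for text in iterator:
        yield list(text)

# Hypothetical usage: fit on a small corpus, then reload the saved model
# later with the real classmethod VocabularyProcessor.restore().
building_model(['今天天气不错', '明天可能下雨'], save_path='vocab.model')
model = preprocessing.VocabularyProcessor.restore('vocab.model')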
def _read_raw_data(self):
    XData = []
    xLengths = []
    YData = []
    names = []

    self.print('======= Reading pre-made vector files... =======')
    self.print('Data source: ' + self.inputSource)

    numSkipped = 0
    with open(self.inputSource, encoding='utf8') as ifile:
        for d in json.load(ifile):
            occ = d['occupation']
            content = d['content']
            numTokens = len(content.split(' '))

            if numTokens < self.minimumWords:
                numSkipped += 1
                continue

            XData.append(content)
            xLengths.append(numTokens)
            YData.append(occ if type(occ) == str else occ[-1])
            names.append(d['name'])

    self.print('%d out of %d skipped' % (numSkipped, numSkipped + len(XData)))

    self.maxXLen = max(xLengths)
    self.vocabProcessor = preprocessing.VocabularyProcessor(self.maxXLen)
    XData = list(self.vocabProcessor.fit_transform(XData))
    self.vocabSize = len(self.vocabProcessor.vocabulary_)

    return np.array(XData), np.array(YData), np.array(xLengths), np.array(names)
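# The reader above assumes a JSON array of records shaped roughly like the
# following (field names taken from the code; the values are invented
# examples):
#
# [
#   {"name": "...", "occupation": "writer", "content": "tokenized text ..."},
#   {"name": "...", "occupation": ["artist", "writer"], "content": "..."}
# ]
#
# Note that `occupation` may be a string or a list; the code keeps the last
# element when it is a list.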
def __init__(self, pos_data, neg_data, word_vec, split_ratio=0.8):
    # load train and test data
    print("load data...")
    self.data_x, self.data_y = self.load_data(pos_data, neg_data)

    # Build vocabulary
    self.max_document_length = max([len(x.split(" ")) for x in self.data_x])
    self.vocab_processor = preprocessing.VocabularyProcessor(self.max_document_length)
    self.data_x = np.array(list(self.vocab_processor.fit_transform(self.data_x)))

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(self.data_y)))
    x_shuffled = self.data_x[shuffle_indices]
    y_shuffled = self.data_y[shuffle_indices]

    dev_sample_index = int(len(y_shuffled) * split_ratio)
    self.train_x, self.test_x = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    self.train_y, self.test_y = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
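# `load_data` is not shown in this excerpt. For a pos/neg corpus the usual
# implementation (a sketch under that assumption, not the author's code)
# reads one file per class and builds one-hot labels; as a method it would
# take `self` as its first argument:
def load_data(pos_data, neg_data):
    pos = [line.strip() for line in open(pos_data, encoding='utf8')]
    neg = [line.strip() for line in open(neg_data, encoding='utf8')]
    data_x = pos + neg
    data_y = np.array([[0, 1]] * len(pos) + [[1, 0]] * len(neg))
    return data_x, data_y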
def fit(self, X, y, sample_weights=None):
    self.__class_labels.clear()
    self.__define_flags()
    self.classes_ = sorted(list(set(y)))
    x_text, y = self.__transform_data(X, y, sample_weights)

    max_doc_length = max([len(x.split(" ")) for x in x_text])
    self.__vocab_proc = preprocessing.VocabularyProcessor(max_doc_length)
    x = np.array(list(self.__vocab_proc.fit_transform(x_text)))

    np.random.seed(self.__random_state)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    dev_sample_index = -1 * int(self.__flags.dev_sample_percentage * float(len(y)))
    x_train = x_shuffled[:dev_sample_index]
    x_dev = x_shuffled[dev_sample_index:]
    y_train = y_shuffled[:dev_sample_index]
    y_dev = y_shuffled[dev_sample_index:]

    with tf.Graph().as_default():
        asp = self.__flags.allow_soft_placement
        ldp = self.__flags.log_device_placement
        session_conf = tf.ConfigProto(allow_soft_placement=asp,
                                      log_device_placement=ldp)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            vocab_size = len(self.__vocab_proc.vocabulary_)
            embedding_size = self.__flags.embedding_dim
            filter_sizes = self.__flags.filter_sizes.split(",")
            filter_sizes = list(map(int, filter_sizes))
            cnn = self.__TextCNN(sequence_length=x_train.shape[1],
                                 num_classes=y_train.shape[1],
                                 vocab_size=vocab_size,
                                 embedding_size=embedding_size,
                                 filter_sizes=filter_sizes,
                                 num_filters=self.__flags.num_filters,
                                 l2_reg_lambda=self.__flags.l2_reg_lambda,
                                 word2vec=self.__w2v)

            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(1e-3)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    name = v.name.replace(":", "_")
                    histogram = "{}/grad/hist".format(name)
                    grad_hist_summary = tf.summary.histogram(histogram, g)
                    sparsity = "{}/grad/sparsity".format(name)
                    frac = tf.nn.zero_fraction(g)
                    sparsity_summary = tf.summary.scalar(sparsity, frac)
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))

            loss_summary = tf.summary.scalar("loss", cnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

            train_summary_op = tf.summary.merge([loss_summary, acc_summary,
                                                 grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

            path = os.path.join(out_dir, "checkpoints")
            self.__checkpoint_dir = os.path.abspath(path)
            checkpoint_prefix = os.path.join(self.__checkpoint_dir, "model")
            if not os.path.exists(self.__checkpoint_dir):
                os.makedirs(self.__checkpoint_dir)
            max_to_keep = self.__flags.num_checkpoints
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=max_to_keep)

            sess.run(tf.global_variables_initializer())

            batches = self.__batch_iter(list(zip(x_train, y_train)),
                                        self.__flags.batch_size,
                                        self.__flags.num_epochs)
            for batch in batches:
                x_batch, y_batch = zip(*batch)
                self.__train_step(x_batch, y_batch, cnn, sess,
                                  self.__flags.dropout_keep_prob, train_op,
                                  global_step, train_summary_op,
                                  train_summary_writer)
                current_step = tf.train.global_step(sess, global_step)
                if current_step % self.__flags.evaluate_every == 0:
                    self.__dev_step(x_dev, y_dev, cnn, sess, global_step,
                                    dev_summary_op, writer=dev_summary_writer)
                if current_step % self.__flags.checkpoint_every == 0:
                    saver.save(sess, checkpoint_prefix, global_step=current_step)

    self.__complete_training()
    return self
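# `__batch_iter` is referenced above but not shown. The conventional batch
# generator used with this kind of training loop (a sketch, not the author's
# exact helper) shuffles the data each epoch and yields fixed-size slices:
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int((data_size - 1) / batch_size) + 1
    for epoch in range(num_epochs):
        if shuffle:
            shuffled = data[np.random.permutation(np.arange(data_size))]
        else:
            shuffled = data
        for batch_num in range(num_batches_per_epoch):
            start = batch_num * batch_size
            end = min((batch_num + 1) * batch_size, data_size)
            yield shuffled[start:end]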
def create_vocab(input_iter, min_freq):
    vp = preprocessing.VocabularyProcessor(max_document_length=100,
                                           min_frequency=min_freq,
                                           tokenizer_fn=tokenizer_fn)
    vp.fit(input_iter)
    return vp
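# Because fit() only needs an iterator, the vocabulary can be built from a
# corpus streamed off disk rather than held in memory. A sketch, assuming a
# hypothetical 'corpus.txt' with one document per line and a tokenizer_fn
# like the one sketched earlier:
def iter_documents(path):
    with open(path, encoding='utf8') as f:
        for line in f:
            yield line.strip()

vp = create_vocab(iter_documents('corpus.txt'), min_freq=2)
print('vocabulary size:', len(vp.vocabulary_))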
# coding: utf-8
"""
Vocabulary model:
* The bag-of-words model captures which words a text consists of, but it
  cannot express the order of the words.
* Borrowing from the bag-of-words idea, the vocabulary model instead encodes
  each sentence word by word using the generated vocabulary.
"""
import numpy as np
from tensorflow.contrib.learn import preprocessing as pc

# Corpus
corpus = ['i love you', 'me too']

vocab = pc.VocabularyProcessor(max_document_length=4)
"""
VocabularyProcessor parameters
* max_document_length: maximum document length. Longer texts are truncated,
  shorter ones are padded with 0.
* min_frequency: minimum word frequency; words occurring fewer times than
  this are not included in the vocabulary.
* vocabulary: a CategoricalVocabulary object.
* tokenizer_fn: tokenizer function.
"""

# Build the vocabulary; it cannot be changed after fitting
vocab.fit(corpus)

# transform() returns an iterator; use next() to step through it
print("Encoding: \n", next(vocab.transform(['i me too'])).tolist())

# Get the encoded matrix for the corpus
mat_corpus = np.array(list(vocab.fit_transform(corpus)))
print("mat_corpus: \n", mat_corpus)
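# The mapping also works in reverse: VocabularyProcessor.reverse() turns id
# matrices back into space-joined tokens, with padding/unknown id 0 rendered
# as the '<UNK>' token:
for doc in vocab.reverse(mat_corpus):
    print(doc)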