def load_data(w2v_model): """Loads starter word-vectors and train/dev/test data.""" # Load the starter word vectors print("Loading data...") x_text, y = load_data_and_labels(FLAGS.train_data_file) max_document_length = max([len(x.split(" ")) for x in x_text]) # 文本最长长度 print('len(x) = ', len(x_text), ' ', len(y)) print(' max_document_length = ', max_document_length) if (w2v_model is None): vocab_processor = learn.preprocessing.VocabularyProcessor( max_document_length) x = np.array(list(vocab_processor.fit_transform(x_text))) vocab_size = len(vocab_processor.vocabulary_) # out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", str(int(time.time())))) vocab_processor.save("vocab.txt") print('save vocab.txt') else: x = get_text_idx(x_text, w2v_model.vocab_hash, max_document_length) vocab_size = len(w2v_model.vocab_hash) print('use w2v .bin') np.random.seed(10) shuffle_indices = np.random.permutation(np.arange(len(y))) x_shuffled = x[shuffle_indices] y_shuffled = y[shuffle_indices] dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y))) x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[ dev_sample_index:] y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[ dev_sample_index:] return x_train, x_dev, y_train, y_dev, vocab_size
def load_data(w2v_model=None): print("laoding data") x_text, y = data_helpers.load_data_and_labels(train_data_file) max_document_length = max([len(x.split(" ")) for x in x_text]) if (w2v_model == None): vocab_processor = learn.preprocessing.VocabularyProcessor( max_document_length) x = np.array(list(vocab_processor.fit_transform(x_text))) vocab_size = len(vocab_processor.vocabulary_) else: x = data_helpers.get_text_idx(x_text, w2v_model.vocab_hash, max_document_length) vocab_size = len(w2v_model.vocab_hash) print('use w2v .bin') np.random.seed(10) shuffle_indices = np.random.permutation(np.arange(len(y))) x_shuffled = x[shuffle_indices] y_shuffled = y[shuffle_indices] dev_sample_index = -1 * int(dev_sample_percentage * float(len(y))) x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[ dev_sample_index:] y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[ dev_sample_index:] return x_train, x_dev, y_train, y_dev, vocab_size
def deal_data(self, text, max_document_length=10):
    words = jieba.cut(text)
    x_text = [' '.join(words)]
    x = data_helpers.get_text_idx(x_text, self.w2v_wr.model.vocab_hash,
                                  max_document_length)
    return x
def deal_data(self, text, max_document_length=10):
    words = jieba.cut(text)  # segment the text with jieba
    x_text = [' '.join(words)]  # join the segmented words with spaces
    # use the get_text_idx function from data_input_helper.py
    x = data_helpers.get_text_idx(x_text, self.w2v_wr.model.vocab_hash,
                                  max_document_length)
    return x
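# Hedged usage note: at prediction time, deal_data turns one raw (unsegmented)
# Chinese sentence into a (1, max_document_length) matrix of word indices that the
# trained CNN can consume. `clf` below is an illustrative instance of the predictor
# class that owns deal_data and an already-loaded w2v_wr.model; it is not defined here.
x = clf.deal_data('这部手机的屏幕很清晰', max_document_length=10)
print(x.shape)  # (1, 10): one row of indices, zero-padded/truncated to length 10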
def load_data(embedding_model, dataset_pickle_path):
    """Loads starter word-vectors and train/dev/test data.

    Input:
        embedding_model: a WordEmbeddingModel object; no specific embedding model is assumed here
        dataset_pickle_path: path to the dataset, e.g. the dataset for a particular subject
    Output:
        x_train, x_dev, y_train, y_dev: training samples/labels and dev samples/labels
        vocab_size: size of the vocabulary
    """
    import pickle
    # Load the starter word vectors
    print("Loading data...")
    # x_text, y = data_helpers.load_data_and_labels(FLAGS.train_data_file)
    # For now only the subject category information is read in
    # content_id_list, x_text, y, _ = pickle.load(open("../pickles/preprocessed-train-dataset.pickle"))
    content_id_list, x_text, y = pickle.load(open(dataset_pickle_path, 'rb'))

    # maximum number of words in a sentence
    max_document_length = max([len(x.split(" ")) for x in x_text])
    print('len(x) = ', len(x_text), ' ', len(y))
    print(' max_document_length = ', max_document_length)

    x = []
    vocab_size = 0
    if embedding_model is None:
        vocab_processor = learn.preprocessing.VocabularyProcessor(
            max_document_length)
        x = np.array(list(vocab_processor.fit_transform(x_text)))
        vocab_size = len(vocab_processor.vocabulary_)
        # out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", str(int(time.time()))))
        vocab_processor.save("vocab.txt")
        print('save vocab.txt')
    else:
        # embedding_model.vocab_hash behaves like a dict mapping word -> index
        print('Using word embeddings!')
        x = data_helpers.get_text_idx(x_text, embedding_model.vocab_hash,
                                      max_document_length)
        vocab_size = len(embedding_model.vocab_hash)

    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

    return x_train, x_dev, y_train, y_dev, vocab_size
def load_data(w2v_model, max_document_length=1290):
    print("Loading data...")
    x_text, y_test = data_helpers.load_data_and_labels(FLAGS.valid_data_file)
    y_test = np.argmax(y_test, axis=1)

    if max_document_length == 0:
        max_document_length = max([len(x.split(" ")) for x in x_text])
    print('max_document_length = ', max_document_length)

    x = data_helpers.get_text_idx(x_text, w2v_model.vocab_hash,
                                  max_document_length)
    return x, y_test
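# This evaluation-time variant collapses the one-hot label matrix returned by
# load_data_and_labels into class indices with np.argmax. A tiny self-contained
# illustration of that step (the labels here are made up for demonstration):
import numpy as np

y_one_hot = np.array([[0, 1], [1, 0], [0, 1]])
print(np.argmax(y_one_hot, axis=1))  # [1 0 1] -> one class index per example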
def load_data(w2v_model): """Loads starter word-vectors and train/dev/test data.""" # Load the starter word vectors print("Loading data...") x_text, y = data_helpers.load_data_and_labels(FLAGS.train_data_file) # for x in x_text: # l = len(x.split(" ")) # break max_document_length = max([len(x.split(" ")) for x in x_text]) print('len(x) = ', len(x_text), ' ', len(y)) print(' max_document_length = ', max_document_length) x = [] vocab_size = 0 if (w2v_model is None): vocab_processor = learn.preprocessing.VocabularyProcessor( max_document_length) x = np.array(list(vocab_processor.fit_transform(x_text))) vocab_size = len(vocab_processor.vocabulary_) # out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", str(int(time.time())))) vocab_processor.save("vocab.txt") print('save vocab.txt') else: x = data_helpers.get_text_idx(x_text, w2v_model.vocab_hash, max_document_length) vocab_size = len(w2v_model.vocab_hash) print('use w2v .bin') np.random.seed(10) shuffle_indices = np.random.permutation(np.arange(len(y))) # print(type(shuffle_indices)) # <class 'numpy.ndarray'> # print(type(x)) # <class 'numpy.ndarray'> # print(x[1]) # [7942 181 949 ... 0 0 0] # print(x[2]) # [7942 174 5 ... 0 0 0] # print(x[1, 2]) # 949 # print(x[[1, 2]]) # [[7942 181 949 ... 0 0 0],[7942 174 5 ... 0 0 0]] # print(x[(1, 2)]) # 949 x_shuffled = x[shuffle_indices] # print(x_shuffled) # exit() y_shuffled = y[shuffle_indices] dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y))) x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[ dev_sample_index:] y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[ dev_sample_index:] return x_train, x_dev, y_train, y_dev, vocab_size
def load_data(w2v_model): """Loads starter word-vectors and train/dev/test data.""" #“加载启动词向量和训练/开发/测试数据。 # Load the starter word vectors加载起始词向量 print("Loading data...") x_text, y = data_helpers.load_data_and_labels(FLAGS.train_data_file) max_document_length = max([len(x.split(" ")) for x in x_text]) print('len(x) = ', len(x_text), ' ', len(y)) print(' max_document_length = ', max_document_length) x = [] vocab_size = 0 if (w2v_model is None): #learn.preprocessing.VocabularyProcessor(max_document_length) #根据所有已分词好的文本建立好一个词典,然后找出每个词在词典中对应的索引,不足长度或者不存在的词补0 #max_document_length 最大文档长度 vocab_processor = learn.preprocessing.VocabularyProcessor( max_document_length) #从x_text中学习到一个词汇表并返回一个id矩阵 x = np.array(list(vocab_processor.fit_transform(x_text))) vocab_size = len(vocab_processor.vocabulary_) # out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", str(int(time.time())))) vocab_processor.save("vocab.txt") print('save vocab.txt') else: x = data_helpers.get_text_idx(x_text, w2v_model.vocab_hash, max_document_length) vocab_size = len(w2v_model.vocab_hash) print('use w2v .bin') #索引值处理 #训练集和测试集的获取 np.random.seed(10) #设定一个随机数种子 shuffle_indices = np.random.permutation(np.arange(len(y))) x_shuffled = x[shuffle_indices] y_shuffled = y[shuffle_indices] dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y))) x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[ dev_sample_index:] y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[ dev_sample_index:] return x_train, x_dev, y_train, y_dev, vocab_size #返回训练集和测试集,还有词向量大小
def load_data(w2v_model, max_document_length=20):
    """Loads starter word-vectors and train/dev/test data."""
    # Load the starter word vectors
    print("Loading data...")
    x_text, y_test = data_helpers.load_data_and_labels(FLAGS.valid_data_file)
    y_test = np.argmax(y_test, axis=1)

    if max_document_length == 0:
        max_document_length = max([len(x.split(" ")) for x in x_text])
    print('max_document_length = ', max_document_length)

    x = data_helpers.get_text_idx(x_text, w2v_model.vocab_hash,
                                  max_document_length)
    return x, y_test
def load_data(w2v_model): print("Loading data...") x_text, y = data_helpers.load_data_and_labels(FLAGS.train_data_file) max_document_length = max([len(x.split(" ")) for x in x_text]) print('len(x) = ', len(x_text), ' ', len(y)) print(' max_document_length = ', max_document_length) x = [] vocab_size = 0 if (w2v_model is None): # 随机初始化 vocab_processor = learn.preprocessing.VocabularyProcessor( max_document_length) x = np.array(list(vocab_processor.fit_transform(x_text))) vocab_size = len(vocab_processor.vocabulary_) vocab_processor.save("vocab.dat") print('save vocab.dat') else: # 加载离线w2v x = data_helpers.get_text_idx(x_text, w2v_model.vocab_hash, max_document_length) vocab_size = len(w2v_model.vocab_hash) print('use w2v .bin') # shuffle np.random.seed(10) shuffle_indices = np.random.permutation(np.arange(len(y))) x_shuffled = x[shuffle_indices] y_shuffled = y[shuffle_indices] dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y))) x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[ dev_sample_index:] y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[ dev_sample_index:] return x_train, x_dev, y_train, y_dev, vocab_size