def getDataSets(self, training_paths, training_labeled_paths, dev_paths, dev_labeled_paths, max_document_length, percent_dev, batch_size):
    x1_text, x2_text, y = self.getTsvData(training_paths)
    ent_x1, ent_x2 = self.getEntData(x1_text, x2_text, training_labeled_paths, max_document_length)
    add_fea = self.getAdditionalFeature(x1_text, x2_text)
    # print(add_fea)
    # Build vocabulary
    print("Building vocabulary")
    vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
    vocab_processor.fit_transform(np.concatenate((x2_text, x1_text), axis=0))
    vocab = vocab_processor.vocabulary_.__dict__['_reverse_mapping']
    self.saveMap(vocab)
    print("Length of loaded vocabulary ={}".format(len(vocab_processor.vocabulary_)))
    i1 = 0
    train_set = []
    dev_set = []
    sum_no_of_batches = 0
    x1 = np.asarray(list(vocab_processor.transform(x1_text)))
    x2 = np.asarray(list(vocab_processor.transform(x2_text)))
    print(x1[0])
    print(ent_x1[0])
    # Randomly shuffle data
    np.random.seed(131)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x1_shuffled = x1[shuffle_indices]
    x2_shuffled = x2[shuffle_indices]
    ent_x1_shuffled = ent_x1[shuffle_indices]
    ent_x2_shuffled = ent_x2[shuffle_indices]
    y_shuffled = y[shuffle_indices]
    add_fea_shuffled = add_fea[shuffle_indices]
    dev_idx = -1 * len(y_shuffled) * percent_dev // 100
    del x1
    del x2
    del ent_x1
    del ent_x2
    if dev_paths is None:
        # Split train/dev set
        self.dumpValidation(x1_text, x2_text, y, shuffle_indices, dev_idx, 0)
        # TODO: This is very crude, should use cross-validation
        x1_train, x1_dev = x1_shuffled[:dev_idx], x1_shuffled[dev_idx:]
        x2_train, x2_dev = x2_shuffled[:dev_idx], x2_shuffled[dev_idx:]
        ent_x1_train, ent_x1_dev = ent_x1_shuffled[:dev_idx], ent_x1_shuffled[dev_idx:]
        ent_x2_train, ent_x2_dev = ent_x2_shuffled[:dev_idx], ent_x2_shuffled[dev_idx:]
        y_train, y_dev = y_shuffled[:dev_idx], y_shuffled[dev_idx:]
        # Split the additional features and raw dev texts the same way; without this,
        # add_fea_train, add_fea_dev, x1_dev_text and x2_dev_text would be undefined
        # in this branch when dev_set is built below.
        add_fea_train, add_fea_dev = add_fea_shuffled[:dev_idx], add_fea_shuffled[dev_idx:]
        x1_dev_text = x1_text[shuffle_indices][dev_idx:]
        x2_dev_text = x2_text[shuffle_indices][dev_idx:]
        print("Train/Dev split for {}: {:d}/{:d}".format(training_paths, len(y_train), len(y_dev)))
    else:
        x1_train, x2_train, ent_x1_train, ent_x2_train, y_train, add_fea_train = \
            x1_shuffled, x2_shuffled, ent_x1_shuffled, ent_x2_shuffled, y_shuffled, add_fea_shuffled
        x1_dev_text, x2_dev_text, y_dev = self.getTsvData(dev_paths)
        ent_x1_dev, ent_x2_dev = self.getEntData(x1_dev_text, x2_dev_text, dev_labeled_paths, max_document_length)
        add_fea_dev = self.getAdditionalFeature(x1_dev_text, x2_dev_text)
        x1_dev = np.asarray(list(vocab_processor.transform(x1_dev_text)))
        x2_dev = np.asarray(list(vocab_processor.transform(x2_dev_text)))
    sum_no_of_batches = sum_no_of_batches + (len(y_train) // batch_size)
    train_set = (x1_train, x2_train, ent_x1_train, ent_x2_train, y_train, add_fea_train)
    dev_set = (x1_dev, x2_dev, ent_x1_dev, ent_x2_dev, y_dev, add_fea_dev, x1_dev_text, x2_dev_text)
    gc.collect()
    return train_set, dev_set, vocab_processor, sum_no_of_batches

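# A minimal usage sketch for the getDataSets variant above. Assumptions: `inpH` is
# an InputHelper-style instance and the path variables are placeholders, not names
# from this repo. It only shows how the returned tuples unpack: train_set carries
# six aligned arrays, dev_set additionally carries the raw dev texts.
train_set, dev_set, vocab_processor, n_batches = inpH.getDataSets(
    training_paths, training_labeled_paths, None, None,
    max_document_length=15, percent_dev=10, batch_size=64)
x1_train, x2_train, ent_x1_train, ent_x2_train, y_train, add_fea_train = train_set
x1_dev, x2_dev, ent_x1_dev, ent_x2_dev, y_dev, add_fea_dev, x1_dev_text, x2_dev_text = dev_set
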
def myGetDataSets(self, cursor, max_document_length, percent_dev, batch_size, is_char_based, number_of_samples):  # edited
    start_time = time.time()
    cursor.execute('select * from dataset_sentence')
    end_time = time.time()
    print('Time elapsed on running select all: {} seconds.'.format(round(end_time - start_time, 2)))
    start_time = time.time()
    tuples = cursor.fetchmany(number_of_samples)
    end_time = time.time()
    print('Time elapsed on fetching {} lines: {} seconds.'.format(number_of_samples, round(end_time - start_time, 2)))
    x1_text = np.asarray([i[0] for i in tuples])
    x2_text = np.asarray([i[1] for i in tuples])
    y = np.asarray([i[2] for i in tuples])
    # Build vocabulary
    print("Building vocabulary")
    vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0, is_char_based=is_char_based)
    vocab_processor.fit_transform(np.concatenate((x2_text, x1_text), axis=0))
    print("Length of loaded vocabulary ={}".format(len(vocab_processor.vocabulary_)))
    i1 = 0
    train_set = []
    dev_set = []
    sum_no_of_batches = 0
    x1 = np.asarray(list(vocab_processor.transform(x1_text)))
    x2 = np.asarray(list(vocab_processor.transform(x2_text)))
    # Randomly shuffle data
    np.random.seed(131)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x1_shuffled = x1[shuffle_indices]
    x2_shuffled = x2[shuffle_indices]
    y_shuffled = y[shuffle_indices]
    dev_idx = -1 * len(y_shuffled) * percent_dev // 100
    del x1
    del x2
    # Split train/test set
    self.dumpValidation(x1_text, x2_text, y, shuffle_indices, dev_idx, 0)
    # TODO: This is very crude, should use cross-validation
    x1_train, x1_dev = x1_shuffled[:dev_idx], x1_shuffled[dev_idx:]
    x2_train, x2_dev = x2_shuffled[:dev_idx], x2_shuffled[dev_idx:]
    y_train, y_dev = y_shuffled[:dev_idx], y_shuffled[dev_idx:]
    print("Train/Dev split for {}: {:d}/{:d}".format('dataset_id', len(y_train), len(y_dev)))
    sum_no_of_batches = sum_no_of_batches + (len(y_train) // batch_size)
    train_set = (x1_train, x2_train, y_train)
    dev_set = (x1_dev, x2_dev, y_dev)
    gc.collect()
    return train_set, dev_set, vocab_processor, sum_no_of_batches

def getAquaintTestDataSet(self, data_path, vocab_path, max_document_length):
    x1_temp, x2_temp, y = self.getAquaintData(data_path)
    # Restore vocabulary
    vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
    vocab_processor = vocab_processor.restore(vocab_path)
    print(len(vocab_processor.vocabulary_))
    x1 = np.asarray(list(vocab_processor.transform(x1_temp)))
    x2 = np.asarray(list(vocab_processor.transform(x2_temp)))
    del vocab_processor
    gc.collect()
    return x1, x2, y

def getPCADataSet(self, data_path, vocab_path, max_document_length):
    x1_temp = self.getJsonPCAData(data_path)
    # Restore vocabulary
    vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
    vocab_processor = vocab_processor.restore(vocab_path)
    print(len(vocab_processor.vocabulary_))
    x1 = np.asarray(list(vocab_processor.transform(x1_temp)))
    del vocab_processor
    gc.collect()
    return x1, x1, np.ones(len(x1))

def getTestDataSet_infer(self, x1_infer, x2_infer, vocab_path, max_document_length):
    x1_temp, x2_temp = np.asarray(x1_infer), np.asarray(x2_infer)
    # x1_temp, x2_temp = self.getTsvTestData_infer(x1_infer, x2_infer)
    # print('This is x1_temp: ', type(x1_temp), x1_temp)
    # Restore vocabulary
    vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
    vocab_processor = vocab_processor.restore(vocab_path)
    print('len vocab: ', len(vocab_processor.vocabulary_))
    x1 = np.asarray(list(vocab_processor.transform(x1_temp)))
    x2 = np.asarray(list(vocab_processor.transform(x2_temp)))
    del vocab_processor
    gc.collect()
    return x1, x2

def getTestDataSet(self, data_path, vocab_path, max_document_length):
    x1_temp, x2_temp, y = self.getTsvTestData(data_path)
    # Restore vocabulary
    vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
    vocab_processor = vocab_processor.restore(vocab_path)
    print(len(vocab_processor.vocabulary_))
    x1 = np.asarray(list(vocab_processor.transform(x1_temp)))
    x2 = np.asarray(list(vocab_processor.transform(x2_temp)))
    del vocab_processor
    gc.collect()
    return x1, x2, y

def getDataSets(self, training_paths, max_document_length, percent_dev, batch_size):
    x1_text, x2_text, y = self.getTsvData(training_paths)
    # print('x1_text= {}'.format(x1_text))
    # print('x2_text= {}'.format(x2_text))
    # print('y= {}'.format(y))
    # Build vocabulary
    print("Building vocabulary")
    vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
    vocab_processor.fit_transform(np.concatenate((x2_text, x1_text), axis=0))
    print("Length of loaded vocabulary ={}".format(len(vocab_processor.vocabulary_)))
    sum_no_of_batches = 0
    x1 = np.asarray(list(vocab_processor.transform(x1_text)))
    x2 = np.asarray(list(vocab_processor.transform(x2_text)))
    # Randomly shuffle data
    np.random.seed(131)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x1_shuffled = x1[shuffle_indices]
    x2_shuffled = x2[shuffle_indices]
    y_shuffled = y[shuffle_indices]
    dev_idx = -1 * len(y_shuffled) * percent_dev // 100
    print('dev_idx= {}'.format(dev_idx))
    # for idx, item in enumerate(y_shuffled):
    #     print('idx={}\n x1_shuffled={}\n x2_shuffled={}\n y_shuffled={}'.format(
    #         idx, x1_shuffled[idx], x2_shuffled[idx], y_shuffled[idx]))
    # exit(0)
    del x1
    del x2
    # Split train/test set
    self.dumpValidation(x1_text, x2_text, y, shuffle_indices, dev_idx, 0)
    # TODO: This is very crude, should use cross-validation
    x1_train, x1_dev = x1_shuffled[:dev_idx], x1_shuffled[dev_idx:]
    x2_train, x2_dev = x2_shuffled[:dev_idx], x2_shuffled[dev_idx:]
    y_train, y_dev = y_shuffled[:dev_idx], y_shuffled[dev_idx:]
    print("Train/Dev split for {}: {:d}/{:d}".format(training_paths, len(y_train), len(y_dev)))
    sum_no_of_batches = sum_no_of_batches + (len(y_train) // batch_size)
    train_set = (x1_train, x2_train, y_train)
    dev_set = (x1_dev, x2_dev, y_dev)
    gc.collect()
    return train_set, dev_set, vocab_processor, sum_no_of_batches

def getTestDataSet(self, data_path, ent_path, vocab_path, max_document_length):
    x1_temp, x2_temp, y = self.getTsvTestData(data_path)
    ent_x1, ent_x2 = self.getEntData(x1_temp, x2_temp, ent_path, max_document_length)
    add_fea_test = self.getAdditionalFeature(x1_temp, x2_temp)
    # Restore vocabulary
    vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
    vocab_processor = vocab_processor.restore(vocab_path)
    print(len(vocab_processor.vocabulary_))
    x1 = np.asarray(list(vocab_processor.transform(x1_temp)))
    x2 = np.asarray(list(vocab_processor.transform(x2_temp)))
    del vocab_processor
    gc.collect()
    return x1, x2, ent_x1, ent_x2, y, x1_temp, x2_temp, add_fea_test

def getDataSets(self, training_paths, max_document_length, percent_dev, batch_size):
    # Tao: x1_text, x2_text, y are all 1-D np arrays
    x1_text, x2_text, y = self.getTsvData(training_paths)
    # Build vocabulary
    print("Building vocabulary")
    vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
    vocab_processor.fit_transform(np.concatenate((x2_text, x1_text), axis=0))
    print("Length of loaded vocabulary ={}".format(len(vocab_processor.vocabulary_)))
    i1 = 0
    train_set = []
    dev_set = []
    sum_no_of_batches = 0
    # Tao: x1 and x2 are both 2-D arrays
    x1 = np.asarray(list(vocab_processor.transform(x1_text)))
    x2 = np.asarray(list(vocab_processor.transform(x2_text)))
    # Randomly shuffle data
    np.random.seed(131)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x1_shuffled = x1[shuffle_indices]
    x2_shuffled = x2[shuffle_indices]
    y_shuffled = y[shuffle_indices]
    dev_idx = -1 * len(y_shuffled) * percent_dev // 100
    del x1
    del x2
    # Split train/test set
    self.dumpValidation(x1_text, x2_text, y, shuffle_indices, dev_idx, 0)
    # TODO: This is very crude, should use cross-validation
    x1_train, x1_dev = x1_shuffled[:dev_idx], x1_shuffled[dev_idx:]
    x2_train, x2_dev = x2_shuffled[:dev_idx], x2_shuffled[dev_idx:]
    y_train, y_dev = y_shuffled[:dev_idx], y_shuffled[dev_idx:]
    print("Train/Dev split for {}: {:d}/{:d}".format(training_paths, len(y_train), len(y_dev)))
    # Tao:
    # root@docker:/opt/siamese_nn/deep-siamese-text-similarity-master# wc -l person_match.train2
    # 13198 person_match.train2
    # Train/Dev split for person_match.train2: 35634/3960 --> 13198 * 3 = 39594, 35634 + 3960 = 39594
    sum_no_of_batches = sum_no_of_batches + (len(y_train) // batch_size)
    train_set = (x1_train, x2_train, y_train)
    dev_set = (x1_dev, x2_dev, y_dev)
    gc.collect()
    return train_set, dev_set, vocab_processor, sum_no_of_batches

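# Worked example of the negative-index split above, using the counts from the
# person_match.train2 note (assuming percent_dev = 10): with len(y_shuffled) = 39594,
# dev_idx = -1 * 39594 * 10 // 100 = -3960, so y_shuffled[:dev_idx] keeps the first
# 35634 rows for training and y_shuffled[dev_idx:] keeps the last 3960 rows for dev.
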
def getDataSets(self, training_paths, max_document_length, percent_dev, batch_size, is_char_based):
    if is_char_based:
        x1_text, x2_text, y = self.getTsvDataCharBased(training_paths)
    else:
        x1_text, x2_text, y = self.getTsvData(training_paths)
    # Build vocabulary
    print("Building vocabulary")
    vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0, is_char_based=is_char_based)
    # After concatenating the two arrays, fit_transform() first fits the vocabulary
    # on the data and then converts the data into its standard (index) form
    vocab_processor.fit_transform(np.concatenate((x2_text, x1_text), axis=0))
    print("Length of loaded vocabulary ={}".format(len(vocab_processor.vocabulary_)))
    i1 = 0
    train_set = []
    dev_set = []
    sum_no_of_batches = 0
    x1 = np.asarray(list(vocab_processor.transform(x1_text)))
    x2 = np.asarray(list(vocab_processor.transform(x2_text)))
    # Randomly shuffle data
    np.random.seed(131)
    shuffle_indices = np.random.permutation(np.arange(len(y)))  # returns a shuffled copy of the index array
    x1_shuffled = x1[shuffle_indices]
    x2_shuffled = x2[shuffle_indices]
    y_shuffled = y[shuffle_indices]
    dev_idx = -1 * len(y_shuffled) * percent_dev // 100
    del x1
    del x2
    # Split train/test set
    self.dumpValidation(x1_text, x2_text, y, shuffle_indices, dev_idx, 0)
    # TODO: This is very crude, should use cross-validation
    x1_train, x1_dev = x1_shuffled[:dev_idx], x1_shuffled[dev_idx:]
    x2_train, x2_dev = x2_shuffled[:dev_idx], x2_shuffled[dev_idx:]
    y_train, y_dev = y_shuffled[:dev_idx], y_shuffled[dev_idx:]
    print("Train/Dev split for {}: {:d}/{:d}".format(training_paths, len(y_train), len(y_dev)))
    sum_no_of_batches = sum_no_of_batches + (len(y_train) // batch_size)
    train_set = (x1_train, x2_train, y_train)
    dev_set = (x1_dev, x2_dev, y_dev)
    gc.collect()
    return train_set, dev_set, vocab_processor, sum_no_of_batches

def getDataSets(self, data, max_document_length, percent_dev, batch_size):
    x1_text, x2_text, y = self.getTsvData(data)
    # Build vocabulary
    logger.info("Building vocabulary")
    vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
    vocab_processor.fit_transform(np.concatenate((x2_text, x1_text), axis=0))
    logger.info("Length of loaded vocabulary ={}".format(len(vocab_processor.vocabulary_)))
    train_set = []
    dev_set = []
    sum_no_of_batches = 0
    x1 = np.asarray(list(vocab_processor.transform(x1_text)))
    x2 = np.asarray(list(vocab_processor.transform(x2_text)))
    # Re-weight the positive samples so the effective positive rate matches pos_rate
    pos_rate = 0.165
    pos_num = y.sum()
    neg_num = y.shape[0] - y.sum()
    logger.info('pos_rate: %s, target pos_rate: %s, pos_num: %s' % (pos_num / y.shape[0], pos_rate, pos_num))
    w = (neg_num * pos_rate) / (pos_num * (1 - pos_rate))
    sample_weight = np.where(y == 1, w, 1)
    calc_pos_rate = (w * pos_num) / (w * pos_num + neg_num)
    logger.info('calc pos_rate: %s' % calc_pos_rate)
    # Stratified split: only the first of the five folds is used as the dev set
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=871)
    for train_idx, test_idx in cv.split(x1, y):
        break
    x1_train, x1_dev = x1[train_idx], x1[test_idx]
    x2_train, x2_dev = x2[train_idx], x2[test_idx]
    y_train, y_dev = y[train_idx], y[test_idx]
    sample_weight = (sample_weight[train_idx], sample_weight[test_idx])
    sum_no_of_batches = sum_no_of_batches + (len(y_train) // batch_size)
    train_set = (x1_train, x2_train, y_train)
    dev_set = (x1_dev, x2_dev, y_dev)
    gc.collect()
    return train_set, dev_set, vocab_processor, sum_no_of_batches, sample_weight

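# A worked example of the re-weighting above (hypothetical counts, not taken from
# any dataset): with pos_num = 1000, neg_num = 9000 and the target pos_rate = 0.165,
# w = (9000 * 0.165) / (1000 * 0.835) ≈ 1.778, and the re-weighted positive share
# (w * 1000) / (w * 1000 + 9000) ≈ 0.165, i.e. exactly the requested pos_rate.
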
def getWords(self, word1, word2, vocab_path, max_document_length):
    temp1 = []
    temp2 = []
    temp1.append(word1.lower())
    temp2.append(word2.lower())
    x1_temp = np.asarray(temp1)
    x2_temp = np.asarray(temp2)
    # Restore vocabulary
    vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
    vocab_processor = vocab_processor.restore(vocab_path)
    x1 = np.asarray(list(vocab_processor.transform(x1_temp)))
    x2 = np.asarray(list(vocab_processor.transform(x2_temp)))
    del vocab_processor
    gc.collect()
    return x1, x2, np.asarray(-1)

def getEmbeddingsMap(self, cursor, max_document_length, num_docs):
    print('Loading sentences')
    # print('Memory (before): {}Mb'.format(mem_profile.memory_usage()))
    ids, sentences = map(list, zip(*datagen.get_sentences_list(cursor, num_docs)))
    # print('Memory (after): {}Mb\n'.format(mem_profile.memory_usage()))
    # Build vocabulary
    print("Building vocabulary")
    vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0, is_char_based=False)
    # sentences_array = np.asarray(sentences)  # line in which memory error occurs with full list of datasets (size = 6620242)
    # print('Memory (before): {}Mb'.format(mem_profile.memory_usage()))
    start_time = time.time()
    vocab_processor.fit_transform(sentences)
    end_time = time.time()
    print('Time elapsed on vocabulary fitting (fit_transform): {} seconds.'.format(round(end_time - start_time, 2)))
    # print('Memory (after): {}Mb'.format(mem_profile.memory_usage()))
    print("Length of loaded vocabulary ={}".format(len(vocab_processor.vocabulary_)))
    print('Vocabulary created!\n')
    # print('Memory (before): {}Mb'.format(mem_profile.memory_usage()))
    start_time = time.time()
    embeddings = np.asarray(list(vocab_processor.transform(sentences)))
    end_time = time.time()
    print('Time elapsed on sentences to word ids (transform): {} seconds.'.format(round(end_time - start_time, 2)))
    # print('Memory (after): {}Mb\n'.format(mem_profile.memory_usage()))
    print('Embeddings generated in memory!')
    gc.collect()
    return dict(zip(ids, embeddings)), vocab_processor

def getDataSets(self, training_paths, max_document_length, percent_dev, batch_size): x1_text, x2_text, y = self.getTsvData(training_paths) # print('x1_text= {}'.format(x1_text)) # print('x2_text= {}'.format(x2_text)) # print ('y= {}'.format(y)) # Build vocabulary print("Building vocabulary") vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0) vocab_processor.fit_transform(np.concatenate((x2_text, x1_text), axis=0)) print("Length of loaded vocabulary ={}".format(len(vocab_processor.vocabulary_))) sum_no_of_batches = 0 x1 = np.asarray(list(vocab_processor.transform(x1_text))) x2 = np.asarray(list(vocab_processor.transform(x2_text))) # Randomly shuffle data np.random.seed(131) shuffle_indices = np.random.permutation(np.arange(len(y))) x1_shuffled = x1[shuffle_indices] x2_shuffled = x2[shuffle_indices] y_shuffled = y[shuffle_indices] dev_idx = -1 * len(y_shuffled) * percent_dev // 100 print('dev_idx= {}'.format(dev_idx)) del x1 del x2 # Split train/test set self.dumpValidation(x1_text, x2_text, y, shuffle_indices, dev_idx, 0) # TODO: This is very crude, should use cross-validation x1_train, x1_dev = x1_shuffled[:dev_idx], x1_shuffled[dev_idx:] x2_train, x2_dev = x2_shuffled[:dev_idx], x2_shuffled[dev_idx:] y_train, y_dev = y_shuffled[:dev_idx], y_shuffled[dev_idx:] print("Train/Dev split for {}: {:d}/{:d}".format(training_paths, len(y_train), len(y_dev))) sum_no_of_batches = sum_no_of_batches + (len(y_train) // batch_size) train_set = (x1_train, x2_train, y_train) dev_set = (x1_dev, x2_dev, y_dev) gc.collect() return train_set, dev_set, vocab_processor, sum_no_of_batches
def toVocabularyIndexVector(self, datax1, datax2, vocab_path, max_document_length):
    """
    Transform the word lists into vocabulary-index vectors.
    :param datax1:
    :param datax2:
    :param vocab_path:
    :param max_document_length:
    :return:
    """
    # Restore vocabulary
    vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
    vocab_processor = vocab_processor.restore(vocab_path)
    print(len(vocab_processor.vocabulary_))
    datax1 = preprocess_arr(datax1)
    datax2 = preprocess_arr(datax2)
    x1 = np.asarray(list(vocab_processor.transform(datax1)))
    x2 = np.asarray(list(vocab_processor.transform(datax2)))
    del vocab_processor
    gc.collect()
    return x1, x2

class InputHelper():
    def __init__(self, data_dir, input_file, batch_size, sequence_length, is_train=True):
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.sequence_length = sequence_length
        vocab_file = os.path.join(data_dir, 'useWords.model.vec')
        # vocab_file = os.path.join(data_dir, 'vocab.pkl')
        input_file = os.path.join(data_dir, input_file)
        if not os.path.exists(vocab_file):
            print('reading train file')
            self.preprocess(input_file, vocab_file)
        else:
            print('loading vocab file')
            self.load_vocab(vocab_file)
        if is_train:
            self.create_batches(input_file)
            self.reset_batch()

    def preprocess(self, input_file, vocab_file, min_freq=2):
        token_freq = defaultdict(int)
        for line in open(input_file):
            seq1, seq2, label = line.rstrip().split('\t')
            seq = seq1 + ' ' + seq2
            for token in seq.split(' '):
                token_freq[token] += 1
        token_list = [w for w in token_freq.keys() if token_freq[w] >= min_freq]
        token_list.append('<pad>')
        token_dict = {token: index for index, token in enumerate(token_list)}
        with open(vocab_file, 'w') as f:
            cPickle.dump(token_dict, f)
        self.token_dictionary = token_dict
        self.vocab_size = len(self.token_dictionary)

    def load_vocab(self, vocab_file):
        self.token_dictionary = dict()
        for line in open(vocab_file):
            l = line.strip().split()
            st = l[0].decode('utf-8')
            self.token_dictionary[st] = np.asarray(l[1:])
        # self.vocab_size = len(self.token_dictionary)
        # with open(vocab_file, 'rb') as f:
        #     self.token_dictionary = cPickle.load(f)
        #     self.vocab_size = len(self.token_dictionary)

    def text_to_array(self, text, is_clip=True):
        words = [w for w in jieba.cut(text[0]) if w.strip()]
        words1 = [TOKENIZER_RE.findall(w)[0] for w in words if TOKENIZER_RE.findall(w)]
        if is_clip:
            words1 = words1[:self.sequence_length]
        return words1
        # seq_ids = [int(self.token_dictionary.get(token)) for token in text
        #            if self.token_dictionary.get(token) is not None]
        # if is_clip:
        #     seq_ids = seq_ids[:self.sequence_length]
        # return seq_ids

    def getTsvData(self, filepath):
        print("Loading training data from " + filepath)
        x1 = []
        x2 = []
        y = []
        # positive samples from file
        for line in open(filepath):
            l = line.strip().split("\t")
            if len(l) < 4:  # l[1], l[2] and l[3] are read below, so four fields are required
                continue
            if random() > 0.5:
                x1.append(l[1])
                x2.append(l[2])
            else:
                x1.append(l[2])
                x2.append(l[1])
            y.append(int(l[3]))
        return np.asarray(x1), np.asarray(x2), np.asarray(y)

    def getTsvTestData(self, filepath):
        print("Loading testing/labelled data from " + filepath)
        x1 = []
        x2 = []
        # positive samples from file
        for line in open(filepath):
            l = line.strip().split("\t")
            if len(l) < 3:  # l[1] and l[2] are read below, so three fields are required
                continue
            x1.append(l[1])
            x2.append(l[2])
        return np.asarray(x1), np.asarray(x2)

    def getTestDataSet(self, data_path, vocab_path, max_document_length):
        x1_temp, x2_temp = self.getTsvTestData(data_path)
        # Restore vocabulary
        self.vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
        self.vocab_processor = self.vocab_processor.restore(vocab_path)
        print(len(self.vocab_processor.vocabulary_))
        x1 = np.asarray(list(self.vocab_processor.transform(x1_temp)))
        x2 = np.asarray(list(self.vocab_processor.transform(x2_temp)))
        return x1, x2

    def batch_iter(self, data, batch_size, num_epochs, shuffle=True):
        """
        Generates a batch iterator for a dataset.
""" data = np.asarray(data) print(data) print(data.shape) data_size = len(data) num_batches_per_epoch = int(len(data) / batch_size) + 1 for epoch in range(num_epochs): # Shuffle the data at each epoch if shuffle: shuffle_indices = np.random.permutation(np.arange(data_size)) shuffled_data = data[shuffle_indices] else: shuffled_data = data for batch_num in range(num_batches_per_epoch): start_index = batch_num * batch_size end_index = min((batch_num + 1) * batch_size, data_size) yield shuffled_data[start_index:end_index] def padding_seq(self, seq_array, padding_index): for i in xrange(len(seq_array), self.sequence_length): seq_array.append(padding_index) def create_batches(self, text_file): x1 = [] x2 = [] y = [] seq1_array = [] seq2_array = [] # padding_index = self.vocab_size - 1 for line in open(text_file): _, seq1, seq2, label = line.rstrip().split('\t') # seq1_array = self.text_to_array(seq1.decode('utf-8').split(' ')) # seq2_array = self.text_to_array(seq2.decode('utf-8').split(' ')) # self.padding_seq(seq1_array, padding_index) # self.padding_seq(seq2_array, padding_index) label = int(label) x1.append(seq1) x2.append(seq2) y.append(label) self.vocab_processor = MyVocabularyProcessor(self.sequence_length, min_frequency=0) self.vocab_processor.fit_transform(np.concatenate((x2, x1), axis=0)) x1_1 = np.asarray(list(self.vocab_processor.transform(x1))) x2_1 = np.asarray(list(self.vocab_processor.transform(x2))) # x1 = np.array(x1) # x2 = np.array(x2) y = np.array(y) self.num_samples = len(y) self.num_batches = self.num_samples / self.batch_size indices = np.random.permutation(self.num_samples) self.x1 = x1_1[indices] self.x2 = x2_1[indices] self.y = y[indices] def next_batch(self): begin = self.pointer end = self.pointer + self.batch_size x1_batch = self.x1[begin:end] x2_batch = self.x2[begin:end] y_batch = self.y[begin:end] new_pointer = self.pointer + self.batch_size if new_pointer >= self.num_samples: self.eos = True else: self.pointer = new_pointer return x1_batch, x2_batch, y_batch def reset_batch(self): self.pointer = 0 self.eos = False