def getVocab(self, vocab_path, max_document_length, filter_h_pad):
    # Restore the saved vocabulary once and cache it on the instance
    if self.vocab_processor is None:
        print('loading vocab')
        vocab_processor = MyVocabularyProcessor(max_document_length - filter_h_pad, min_frequency=0)
        self.vocab_processor = vocab_processor.restore(vocab_path)
    return self.vocab_processor
def getTestDataSet(self, data_path, vocab_path, max_document_length):
    x1_temp, x2_temp, y = self.getTsvTestData(data_path)

    # Restore the vocabulary built during training
    vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
    vocab_processor = vocab_processor.restore(vocab_path)

    # Dump the restored vocabulary, one token per line, for inspection
    with open("./vocab", "w", encoding="utf-8") as f:
        for i in range(len(vocab_processor.vocabulary_)):
            f.write(vocab_processor.vocabulary_.reverse(i) + "\n")

    # Map both sentence columns to fixed-length id sequences
    x1 = np.asarray(list(vocab_processor.transform(x1_temp)))
    x2 = np.asarray(list(vocab_processor.transform(x2_temp)))

    # Free the vocabulary processor once the inputs are encoded
    del vocab_processor
    gc.collect()
    return x1, x2, y
def getDataSets(self, training_paths, max_document_length, percent_dev, batch_size):
    x1_text, x2_text, y = self.getTsvDataCharBased(training_paths)

    # Build vocabulary
    print("Building vocabulary")
    vocab_processor = MyVocabularyProcessor(max_document_length, min_frequency=0)
    vocab_processor.fit_transform(np.concatenate((x2_text, x1_text), axis=0))
    # f = open("./voc", "w", encoding="utf-8")
    # for i in range(len(vocab_processor.vocabulary_)):
    #     f.write(vocab_processor.vocabulary_.reverse(i) + "\n")
    print("Length of loaded vocabulary ={}".format(len(vocab_processor.vocabulary_)))

    sum_no_of_batches = 0
    x1 = np.asarray(list(vocab_processor.transform(x1_text)))
    x2 = np.asarray(list(vocab_processor.transform(x2_text)))

    # Randomly shuffle data
    np.random.seed(131)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x1_shuffled = x1[shuffle_indices]
    x2_shuffled = x2[shuffle_indices]
    y_shuffled = y[shuffle_indices]
    dev_idx = -1 * len(y_shuffled) * percent_dev // 100
    del x1
    del x2

    # Split train/dev set
    self.dumpValidation(x1_text, x2_text, y, shuffle_indices, dev_idx, 0)
    # TODO: This is very crude, should use cross-validation
    x1_train, x1_dev = x1_shuffled[:dev_idx], x1_shuffled[dev_idx:]
    x2_train, x2_dev = x2_shuffled[:dev_idx], x2_shuffled[dev_idx:]
    y_train, y_dev = y_shuffled[:dev_idx], y_shuffled[dev_idx:]
    print("Train/Dev split for {}: {:d}/{:d}".format(training_paths, len(y_train), len(y_dev)))

    sum_no_of_batches = sum_no_of_batches + (len(y_train) // batch_size)
    train_set = (x1_train, x2_train, y_train)
    dev_set = (x1_dev, x2_dev, y_dev)
    gc.collect()
    return train_set, dev_set, vocab_processor, sum_no_of_batches
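# Minimal usage sketch, kept as comments so it does not run at import time.
# Assumptions (not confirmed by this excerpt): these methods belong to the
# module's input-helper class (named InputHelper here for illustration), and
# the module imports `gc`, `numpy as np`, and `MyVocabularyProcessor` at the
# top of the file. Paths and hyperparameters below are illustrative only.
#
#   inpH = InputHelper()
#   train_set, dev_set, vocab_processor, n_batches = inpH.getDataSets(
#       "train.tsv", max_document_length=15, percent_dev=10, batch_size=64)
#   x1_test, x2_test, y_test = inpH.getTestDataSet(
#       "test.tsv", "runs/checkpoints/vocab", max_document_length=15)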