Example #1
    def getVocab(self, vocab_path, max_document_length, filter_h_pad):
        # Restore a previously saved vocabulary processor and cache it on the
        # instance so repeated calls do not reload it from disk.
        if self.vocab_processor is None:
            print('loading vocab')
            vocab_processor = MyVocabularyProcessor(max_document_length -
                                                    filter_h_pad,
                                                    min_frequency=0)
            self.vocab_processor = vocab_processor.restore(vocab_path)
        return self.vocab_processor
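
A minimal usage sketch for getVocab, assuming the method lives on an input-helper class (called InputHelper here) whose instances start with self.vocab_processor set to None; the class name, path, and lengths below are illustrative, not taken from the original.

# Hypothetical driver for getVocab; InputHelper and the path/lengths are assumptions.
helper = InputHelper()
# First call restores the processor from disk ...
vocab = helper.getVocab("runs/checkpoints/vocab", max_document_length=15, filter_h_pad=5)
# ... later calls return the cached instance.
same_vocab = helper.getVocab("runs/checkpoints/vocab", 15, 5)
assert vocab is same_vocab
print("vocabulary size:", len(vocab.vocabulary_))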
Example #2
    def getTestDataSet(self, data_path, vocab_path, max_document_length):
        x1_temp, x2_temp, y = self.getTsvTestData(data_path)
        # Restore the vocabulary that was fitted during training
        vocab_processor = MyVocabularyProcessor(max_document_length,
                                                min_frequency=0)
        vocab_processor = vocab_processor.restore(vocab_path)
        # Dump the restored vocabulary to a file for inspection
        with open("./vocab", "w", encoding="utf-8") as f:
            for i in range(len(vocab_processor.vocabulary_)):
                f.write(vocab_processor.vocabulary_.reverse(i) + "\n")

        # Convert the raw text into fixed-length index sequences
        x1 = np.asarray(list(vocab_processor.transform(x1_temp)))
        x2 = np.asarray(list(vocab_processor.transform(x2_temp)))
        del vocab_processor
        gc.collect()
        return x1, x2, y
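
A usage sketch for getTestDataSet under the same assumptions (an InputHelper instance; the TSV path, vocab path, and max_document_length are placeholders):

# Hypothetical call; paths and max_document_length are assumptions.
x1_test, x2_test, y_test = helper.getTestDataSet(
    "data/test.tsv", "runs/checkpoints/vocab", max_document_length=15)
print(x1_test.shape, x2_test.shape, y_test.shape)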
Example #3
    def getDataSets(self, training_paths, max_document_length, percent_dev,
                    batch_size):
        x1_text, x2_text, y = self.getTsvDataCharBased(training_paths)
        # Build vocabulary over both sentence columns
        print("Building vocabulary")
        vocab_processor = MyVocabularyProcessor(max_document_length,
                                                min_frequency=0)
        vocab_processor.fit_transform(
            np.concatenate((x2_text, x1_text), axis=0))
        # f = open("./voc", "w", encoding="utf-8")
        # for i in range(len(vocab_processor.vocabulary_)):
        #     f.write(vocab_processor.vocabulary_.reverse(i) + "\n")
        print("Length of loaded vocabulary ={}".format(
            len(vocab_processor.vocabulary_)))
        sum_no_of_batches = 0
        # Convert the raw text into fixed-length index sequences
        x1 = np.asarray(list(vocab_processor.transform(x1_text)))
        x2 = np.asarray(list(vocab_processor.transform(x2_text)))
        # Randomly shuffle data
        np.random.seed(131)
        shuffle_indices = np.random.permutation(np.arange(len(y)))
        x1_shuffled = x1[shuffle_indices]
        x2_shuffled = x2[shuffle_indices]
        y_shuffled = y[shuffle_indices]
        # Negative index marking where the dev split begins
        dev_idx = -1 * len(y_shuffled) * percent_dev // 100
        del x1
        del x2
        # Split train/dev set
        self.dumpValidation(x1_text, x2_text, y, shuffle_indices, dev_idx, 0)
        # TODO: This is very crude, should use cross-validation
        x1_train, x1_dev = x1_shuffled[:dev_idx], x1_shuffled[dev_idx:]
        x2_train, x2_dev = x2_shuffled[:dev_idx], x2_shuffled[dev_idx:]
        y_train, y_dev = y_shuffled[:dev_idx], y_shuffled[dev_idx:]
        print("Train/Dev split for {}: {:d}/{:d}".format(
            training_paths, len(y_train), len(y_dev)))
        sum_no_of_batches = sum_no_of_batches + (len(y_train) // batch_size)
        train_set = (x1_train, x2_train, y_train)
        dev_set = (x1_dev, x2_dev, y_dev)
        gc.collect()
        return train_set, dev_set, vocab_processor, sum_no_of_batches
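
A sketch of how a training script might call getDataSets, again assuming an InputHelper instance; the training file, dev percentage, and batch size are illustrative values, not from the original.

# Hypothetical training-side call; all arguments are assumptions.
train_set, dev_set, vocab_processor, num_batches = helper.getDataSets(
    "data/train.tsv", max_document_length=15, percent_dev=10, batch_size=64)
x1_train, x2_train, y_train = train_set
x1_dev, x2_dev, y_dev = dev_set
print("train examples:", len(y_train), "dev examples:", len(y_dev))
print("batches per epoch:", num_batches)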