Example #1
    def __init__(self, options):
        self.options = options
        training_dir = os.path.join(options.data_dir,
                                    constants.TRAIN_FOLDER_NAME)
        validation_dir = os.path.join(options.data_dir,
                                      constants.DEV_FOLDER_NAME)
        self.vocab = get_vocab(options.data_dir)

        self.train_ds = _SquadDataset(training_dir, options, self.vocab)
        self.dev_ds = _SquadDataset(validation_dir, options, self.vocab)
        print(
            "%d train context files, %d dev" %
            (len(self.train_ds.context_files), len(self.dev_ds.context_files)))

        # A feedable iterator: the string handle fed at session-run time
        # selects whether batches come from the train or dev dataset.
        self.handle = tf.placeholder(tf.string, shape=[])
        self.iterator = tf.contrib.data.Iterator.from_string_handle(
            self.handle, self.train_ds.zip_ds.output_types,
            self.train_ds.zip_ds.output_shapes)

        self.embeddings = embedding_util.load_word_embeddings_including_unk_and_padding(
            options)
        self.word_chars = embedding_util.load_word_char_embeddings_including_unk_and_padding(
            options)

        self.word_vec_size = constants.WORD_VEC_DIM
        self.max_word_len = constants.MAX_WORD_LEN
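
A hedged sketch of how this handle-based setup is typically driven at session time (the function and iterator names below are assumptions, not part of the class above): each dataset gets its own iterator, the session resolves each one to a string handle, and feeding a different handle into self.handle switches the same input pipeline between train and dev.

import tensorflow as tf

def make_handles(data, sess):
    # Assumed: each _SquadDataset exposes zip_ds, as used in __init__ above.
    train_iterator = data.train_ds.zip_ds.make_one_shot_iterator()
    dev_iterator = data.dev_ds.zip_ds.make_one_shot_iterator()
    # string_handle() returns a tensor; sess.run materializes its value.
    train_handle = sess.run(train_iterator.string_handle())
    dev_handle = sess.run(dev_iterator.string_handle())
    # The same get_next() op now reads from whichever dataset is fed.
    next_batch = data.iterator.get_next()
    train_batch = sess.run(next_batch, feed_dict={data.handle: train_handle})
    dev_batch = sess.run(next_batch, feed_dict={data.handle: dev_handle})
    return train_batch, dev_batch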
Example #2
    def create_train_data(self):
        train_folder = os.path.join(self.data_dir, constants.TRAIN_FOLDER_NAME)
        dev_folder = os.path.join(self.data_dir, constants.DEV_FOLDER_NAME)
        train_files_wrapper = DatasetFilesWrapper(train_folder)
        dev_files_wrapper = DatasetFilesWrapper(dev_folder)

        # Skip regeneration if both folders already contain saved files.
        if all([len(os.listdir(f)) > 0 for f in [train_folder, dev_folder]]):
            print("Train & dev data already exist.")
            return

        print("Getting vocabulary")
        self.vocab = get_vocab(self.data_dir)
        print("Finished getting vocabulary")
        self.nlp = spacy.load("en")
        print("Getting DEV dataset")
        dev_raw_data = self._create_train_data_internal(
            constants.DEV_SQUAD_FILE, is_dev=True)
        print("Getting TRAIN dataset")
        train_raw_data = self._create_train_data_internal(
            constants.TRAIN_SQUAD_FILE, is_dev=False)
        print("Num NER categories", self.ner_categories.get_num_categories())
        print("Num POS categories", self.pos_categories.get_num_categories())

        max_context_length = max(
            max([len(x) for x in train_raw_data.list_contexts]),
            max([len(x) for x in dev_raw_data.list_contexts]))

        max_question_length = max(
            max([len(x) for x in train_raw_data.list_questions]),
            max([len(x) for x in dev_raw_data.list_questions]))

        print("Saving TRAIN data")
        train_file_saver = DatasetFilesSaver(train_files_wrapper,
                                             max_context_length,
                                             max_question_length, self.vocab,
                                             train_raw_data)
        train_file_saver.save()

        print("Saving DEV data")
        dev_file_saver = DatasetFilesSaver(dev_files_wrapper,
                                           max_context_length,
                                           max_question_length, self.vocab,
                                           dev_raw_data)
        dev_file_saver.save()

        print("Finished creating training data!")
Example #3
def save_cove_weights(options):
    """Saves the weights of the CoVe LSTM for manual TensorFlow initialization.
    """
    folder_name = os.path.join(options.data_dir, constants.COVE_WEIGHTS_FOLDER)
    if all([os.path.exists(os.path.join(folder_name, name + ".npy"))
            for name in constants.COVE_WEIGHT_NAMES]):
        print("CoVe weights already saved")
        return
    os.makedirs(folder_name, exist_ok=True)
    vocab = get_vocab(options.data_dir)
    embeddings = embedding_util.load_word_embeddings_including_unk_and_padding(
        options)
    # The bidirectional CoVe LSTM outputs twice the input embedding width.
    vec_size = 2 * embeddings.shape[1]
    print("Loading CoVe model")
    model = MTLSTM(n_vocab=embeddings.shape[0],
                   vectors=torch.from_numpy(embeddings.astype(np.float32)))
    print("Saving CoVe weights")
    for weight_name in constants.COVE_WEIGHT_NAMES:
        tensor = getattr(model.rnn, weight_name)
        np_value = tensor.cpu().data.numpy()
        full_file_name = os.path.join(folder_name, weight_name + ".npy")
        np.save(full_file_name, np_value)
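
The docstring says these arrays are saved for manual TensorFlow initialization; a minimal sketch of the loading direction (the function name and the choice to freeze the variable are assumptions) could be:

import os
import numpy as np
import tensorflow as tf

def load_cove_weight(folder_name, weight_name):
    """Creates a TF variable initialized from one saved CoVe weight array."""
    np_value = np.load(os.path.join(folder_name, weight_name + ".npy"))
    return tf.get_variable(
        weight_name,
        shape=np_value.shape,
        initializer=tf.constant_initializer(np_value),
        trainable=False)  # keep the pretrained LSTM weights frozen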
Example #4
    def __init__(self, options):
        self.vocab = get_vocab(options.data_dir)
        vocab_size = self.vocab.get_vocab_size_without_pad_or_unk()
        self.embeddings = np.random.uniform(-1.0, 1.0,
            size=(vocab_size + 2, _WORD_DIM))
        self.word_vec_size = _WORD_DIM
        self.max_word_len = _WORD_LEN
        self.text_tokens = [
            [self.vocab.get_word_for_id(np.random.randint(0, vocab_size))
             for _ in range(_CTX_LEN)]
            for _ in range(_NUM_SAMPLES_PER_FILE)]

        self.ctx = np.random.randint(0, vocab_size, size=(_NUM_SAMPLES_PER_FILE, _CTX_LEN))
        self.qst = np.random.randint(0, vocab_size, size=(_NUM_SAMPLES_PER_FILE, _QST_LEN))
        self.spn = np.zeros((_NUM_SAMPLES_PER_FILE, 2), dtype=np.int32)
        # Each answer span is a sorted (start, end) pair within the context.
        for z in range(_NUM_SAMPLES_PER_FILE):
            self.spn[z] = sorted([np.random.randint(0, _CTX_LEN),
                                  np.random.randint(0, _CTX_LEN)])
        self.data_index = np.arange(self.ctx.shape[0])
        self.word_in_question = np.random.randint(0, 2, size=(_NUM_SAMPLES_PER_FILE, _CTX_LEN))
        self.word_in_context = np.random.randint(0, 2, size=(_NUM_SAMPLES_PER_FILE, _QST_LEN))
        self.question_ids = self.data_index
        self.context_pos  = np.random.randint(0, 2**7, size=(_NUM_SAMPLES_PER_FILE, _CTX_LEN), dtype=np.int8)
        self.question_pos = np.random.randint(0, 2**7, size=(_NUM_SAMPLES_PER_FILE, _QST_LEN), dtype=np.int8)
        self.context_ner  = np.random.randint(0, 2**7, size=(_NUM_SAMPLES_PER_FILE, _CTX_LEN), dtype=np.int8)
        self.question_ner = np.random.randint(0, 2**7, size=(_NUM_SAMPLES_PER_FILE, _QST_LEN), dtype=np.int8)
        self.word_chars = np.random.randint(0, 2**8 - 2, size=(vocab_size + 2, _WORD_LEN), dtype=np.uint8)


        self.ctx_ds = tf.contrib.data.Dataset.from_tensor_slices(self.ctx)
        self.qst_ds = tf.contrib.data.Dataset.from_tensor_slices(self.qst)
        self.spn_ds = tf.contrib.data.Dataset.from_tensor_slices(self.spn)
        self.data_index_ds = tf.contrib.data.Dataset.from_tensor_slices(self.data_index)
        self.word_in_question_ds = tf.contrib.data.Dataset.from_tensor_slices(self.word_in_question)
        self.word_in_context_ds = tf.contrib.data.Dataset.from_tensor_slices(self.word_in_context)
        self.question_ids_ds = tf.contrib.data.Dataset.from_tensor_slices(self.question_ids)
        self.context_pos_ds = tf.contrib.data.Dataset.from_tensor_slices(self.context_pos)
        self.question_pos_ds = tf.contrib.data.Dataset.from_tensor_slices(self.question_pos)
        self.context_ner_ds = tf.contrib.data.Dataset.from_tensor_slices(self.context_ner)
        self.question_ner_ds = tf.contrib.data.Dataset.from_tensor_slices(self.question_ner)

        # Note: shuffle() after batch()/repeat() shuffles whole batches (with
        # a small buffer) rather than individual samples, which is acceptable
        # for this synthetic test dataset.
        self.zip_ds = tf.data.Dataset.zip({
            _CONTEXT_KEY: self.ctx_ds,
            _QUESTION_KEY: self.qst_ds,
            _SPAN_KEY: self.spn_ds,
            _WORD_IN_QUESTION_KEY: self.word_in_question_ds,
            _WORD_IN_CONTEXT_KEY: self.word_in_context_ds,
            _QUESTION_IDS_KEY: self.question_ids_ds,
            _CONTEXT_POS_KEY: self.context_pos_ds,
            _CONTEXT_NER_KEY: self.context_ner_ds,
            _QUESTION_POS_KEY: self.question_pos_ds,
            _QUESTION_NER_KEY: self.question_ner_ds,
        }) \
        .batch(options.batch_size) \
        .repeat() \
        .shuffle(buffer_size=10)
        self.zip_iterator = self.zip_ds.make_initializable_iterator()

        self.train_handle = None
        self.val_handle = None
        self.handle = tf.placeholder(tf.string, shape=[], name="data_handle")
        self.iterator = tf.contrib.data.Iterator.from_string_handle(
            self.handle, self.zip_ds.output_types,
            self.zip_ds.output_shapes)
        self.total_samples_processed = 0

        self.train_ds = _TestDataset(self)
        self.dev_ds = self.train_ds  # dev reuses the same synthetic data
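
Unlike a one-shot iterator, the initializable zip_iterator above must be run once before its handle can be used; a minimal driver for pulling one batch from this test dataset (the function name and session handling are assumptions) might be:

import tensorflow as tf

def fetch_test_batch(ds):
    """Initializes the test iterator and returns one batch dict."""
    next_batch = ds.iterator.get_next()
    with tf.Session() as sess:
        sess.run(ds.zip_iterator.initializer)
        ds.train_handle = sess.run(ds.zip_iterator.string_handle())
        # The dict keys match the _*_KEY names zipped together above.
        return sess.run(next_batch, feed_dict={ds.handle: ds.train_handle})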