def __init__(self, options):
    self.options = options
    training_dir = os.path.join(options.data_dir, constants.TRAIN_FOLDER_NAME)
    validation_dir = os.path.join(options.data_dir, constants.DEV_FOLDER_NAME)
    self.vocab = get_vocab(options.data_dir)
    self.train_ds = _SquadDataset(training_dir, options, self.vocab)
    self.dev_ds = _SquadDataset(validation_dir, options, self.vocab)
    print("%d train file batches, %d dev"
          % (len(self.train_ds.context_files), len(self.dev_ds.context_files)))
    self.handle = tf.placeholder(tf.string, shape=[])
    self.iterator = tf.contrib.data.Iterator.from_string_handle(
        self.handle, self.train_ds.zip_ds.output_types,
        self.train_ds.zip_ds.output_shapes)
    self.embeddings = embedding_util.load_word_embeddings_including_unk_and_padding(
        options)
    self.word_chars = embedding_util.load_word_char_embeddings_including_unk_and_padding(
        options)
    self.word_vec_size = constants.WORD_VEC_DIM
    self.max_word_len = constants.MAX_WORD_LEN
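# A minimal sketch (not from this repo) of how the feedable-iterator handle
# created above is typically used in TF 1.x: each dataset gets its own
# iterator, and feeding a different string handle switches which dataset the
# shared `self.iterator` reads from. `SquadData` and the iterator wiring below
# are assumptions for illustration only.
#
#   data = SquadData(options)
#   train_iter = data.train_ds.zip_ds.make_initializable_iterator()
#   dev_iter = data.dev_ds.zip_ds.make_initializable_iterator()
#   next_batch = data.iterator.get_next()
#   with tf.Session() as sess:
#       sess.run([train_iter.initializer, dev_iter.initializer])
#       train_handle = sess.run(train_iter.string_handle())
#       dev_handle = sess.run(dev_iter.string_handle())
#       sess.run(next_batch, feed_dict={data.handle: train_handle})
#       sess.run(next_batch, feed_dict={data.handle: dev_handle})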
def create_train_data(self):
    train_folder = os.path.join(self.data_dir, constants.TRAIN_FOLDER_NAME)
    dev_folder = os.path.join(self.data_dir, constants.DEV_FOLDER_NAME)
    train_files_wrapper = DatasetFilesWrapper(train_folder)
    dev_files_wrapper = DatasetFilesWrapper(dev_folder)
    if all([len(os.listdir(f)) > 0 for f in [train_folder, dev_folder]]):
        print("Train & dev data already exist.")
        return
    print("Getting vocabulary")
    self.vocab = get_vocab(self.data_dir)
    print("Finished getting vocabulary")
    self.nlp = spacy.load("en")
    print("Getting DEV dataset")
    dev_raw_data = self._create_train_data_internal(
        constants.DEV_SQUAD_FILE, is_dev=True)
    print("Getting TRAIN dataset")
    train_raw_data = self._create_train_data_internal(
        constants.TRAIN_SQUAD_FILE, is_dev=False)
    print("Num NER categories", self.ner_categories.get_num_categories())
    print("Num POS categories", self.pos_categories.get_num_categories())
    max_context_length = max(
        max([len(x) for x in train_raw_data.list_contexts]),
        max([len(x) for x in dev_raw_data.list_contexts]))
    max_question_length = max(
        max([len(x) for x in train_raw_data.list_questions]),
        max([len(x) for x in dev_raw_data.list_questions]))
    print("Saving TRAIN data")
    train_file_saver = DatasetFilesSaver(
        train_files_wrapper, max_context_length, max_question_length,
        self.vocab, train_raw_data)
    train_file_saver.save()
    print("Saving DEV data")
    dev_file_saver = DatasetFilesSaver(
        dev_files_wrapper, max_context_length, max_question_length,
        self.vocab, dev_raw_data)
    dev_file_saver.save()
    print("Finished creating training data!")
def save_cove_weights(options):
    """Saves the weights of the CoVe LSTM for manual TensorFlow initialization."""
    folder_name = os.path.join(options.data_dir, constants.COVE_WEIGHTS_FOLDER)
    if all([os.path.exists(os.path.join(folder_name, name + ".npy"))
            for name in constants.COVE_WEIGHT_NAMES]):
        print("Cove weights already saved")
        return
    os.makedirs(folder_name, exist_ok=True)
    vocab = get_vocab(options.data_dir)
    embeddings = embedding_util.load_word_embeddings_including_unk_and_padding(
        options)
    vec_size = 2 * embeddings.shape[1]
    print("Loading CoVe model")
    model = MTLSTM(n_vocab=embeddings.shape[0],
                   vectors=torch.from_numpy(embeddings.astype(np.float32)))
    print("Saving CoVe weights")
    for weight_name in constants.COVE_WEIGHT_NAMES:
        tensor = getattr(model.rnn, weight_name)
        np_value = tensor.cpu().data.numpy()
        full_file_name = os.path.join(folder_name, weight_name + ".npy")
        np.save(full_file_name, np_value)
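# Hedged sketch (illustrative, not part of the original code): the .npy files
# written above are intended for "manual TensorFlow initialization", which
# would look roughly like the helper below. The helper name and the exact
# variable layout are assumptions.
def _load_cove_weight(options, weight_name):
    """Loads one previously saved CoVe weight matrix as a numpy array."""
    folder_name = os.path.join(options.data_dir, constants.COVE_WEIGHTS_FOLDER)
    return np.load(os.path.join(folder_name, weight_name + ".npy"))

# Example usage (TF 1.x):
#   weight = _load_cove_weight(options, constants.COVE_WEIGHT_NAMES[0])
#   var = tf.get_variable("cove/" + constants.COVE_WEIGHT_NAMES[0],
#                         initializer=weight, trainable=False)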
def __init__(self, options):
    self.vocab = get_vocab(options.data_dir)
    vocab_size = self.vocab.get_vocab_size_without_pad_or_unk()
    self.embeddings = np.random.uniform(-1.0, 1.0,
        size=(vocab_size + 2, _WORD_DIM))
    self.word_vec_size = _WORD_DIM
    self.max_word_len = _WORD_LEN
    self.text_tokens = [
        [self.vocab.get_word_for_id(np.random.randint(0, vocab_size))
         for _ in range(_CTX_LEN)]
        for _ in range(_NUM_SAMPLES_PER_FILE)]
    self.ctx = np.random.randint(0, vocab_size,
        size=(_NUM_SAMPLES_PER_FILE, _CTX_LEN))
    self.qst = np.random.randint(0, vocab_size,
        size=(_NUM_SAMPLES_PER_FILE, _QST_LEN))
    self.spn = np.zeros((_NUM_SAMPLES_PER_FILE, 2), dtype=np.int32)
    for z in range(_NUM_SAMPLES_PER_FILE):
        spns = sorted([np.random.randint(0, _CTX_LEN),
                       np.random.randint(0, _CTX_LEN)])
        self.spn[z, 0] = spns[0]
        self.spn[z, 1] = spns[1]
    self.data_index = np.arange(self.ctx.shape[0])
    self.word_in_question = np.random.randint(0, 2,
        size=(_NUM_SAMPLES_PER_FILE, _CTX_LEN))
    self.word_in_context = np.random.randint(0, 2,
        size=(_NUM_SAMPLES_PER_FILE, _QST_LEN))
    self.question_ids = self.data_index
    self.context_pos = np.random.randint(0, 2**7,
        size=(_NUM_SAMPLES_PER_FILE, _CTX_LEN), dtype=np.int8)
    self.question_pos = np.random.randint(0, 2**7,
        size=(_NUM_SAMPLES_PER_FILE, _QST_LEN), dtype=np.int8)
    self.context_ner = np.random.randint(0, 2**7,
        size=(_NUM_SAMPLES_PER_FILE, _CTX_LEN), dtype=np.int8)
    self.question_ner = np.random.randint(0, 2**7,
        size=(_NUM_SAMPLES_PER_FILE, _QST_LEN), dtype=np.int8)
    self.word_chars = np.random.randint(0, 2**8 - 2,
        size=(vocab_size + 2, _WORD_LEN), dtype=np.uint8)
    self.ctx_ds = tf.contrib.data.Dataset.from_tensor_slices(self.ctx)
    self.qst_ds = tf.contrib.data.Dataset.from_tensor_slices(self.qst)
    self.spn_ds = tf.contrib.data.Dataset.from_tensor_slices(self.spn)
    self.data_index_ds = tf.contrib.data.Dataset.from_tensor_slices(self.data_index)
    self.word_in_question_ds = tf.contrib.data.Dataset.from_tensor_slices(
        self.word_in_question)
    self.word_in_context_ds = tf.contrib.data.Dataset.from_tensor_slices(
        self.word_in_context)
    self.question_ids_ds = tf.contrib.data.Dataset.from_tensor_slices(
        self.question_ids)
    self.context_pos_ds = tf.contrib.data.Dataset.from_tensor_slices(
        self.context_pos)
    self.question_pos_ds = tf.contrib.data.Dataset.from_tensor_slices(
        self.question_pos)
    self.context_ner_ds = tf.contrib.data.Dataset.from_tensor_slices(
        self.context_ner)
    self.question_ner_ds = tf.contrib.data.Dataset.from_tensor_slices(
        self.question_ner)
    self.zip_ds = tf.data.Dataset.zip({
            _CONTEXT_KEY: self.ctx_ds,
            _QUESTION_KEY: self.qst_ds,
            _SPAN_KEY: self.spn_ds,
            _WORD_IN_QUESTION_KEY: self.word_in_question_ds,
            _WORD_IN_CONTEXT_KEY: self.word_in_context_ds,
            _QUESTION_IDS_KEY: self.question_ids_ds,
            _CONTEXT_POS_KEY: self.context_pos_ds,
            _CONTEXT_NER_KEY: self.context_ner_ds,
            _QUESTION_POS_KEY: self.question_pos_ds,
            _QUESTION_NER_KEY: self.question_ner_ds,
        }) \
        .batch(options.batch_size) \
        .repeat() \
        .shuffle(buffer_size=10)
    self.zip_iterator = self.zip_ds.make_initializable_iterator()
    self.train_handle = None
    self.val_handle = None
    self.handle = tf.placeholder(tf.string, shape=[], name="data_handle")
    self.iterator = tf.contrib.data.Iterator.from_string_handle(
        self.handle, self.zip_ds.output_types, self.zip_ds.output_shapes)
    self.total_samples_processed = 0
    self.train_ds = _TestDataset(self)
    self.dev_ds = self.train_ds
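# Hedged sketch (assumption, not from this repo): in a test, the
# `train_handle`/`val_handle` fields above would typically be populated once a
# Session exists, by evaluating the iterator's string handle.
#
#   with tf.Session() as sess:
#       sess.run(test_data.zip_iterator.initializer)
#       handle = sess.run(test_data.zip_iterator.string_handle())
#       test_data.train_handle = handle
#       test_data.val_handle = handle  # train and dev share the same fake data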