def train(self, training_data):
    x_train_text, y_train, x_test_text, y_test, vocab, class_to_i, i_to_class = preprocess_dataset(
        training_data
    )
    self.class_to_i = class_to_i
    self.i_to_class = i_to_class
    self.vocab = vocab

    embeddings, embedding_lookup = load_embeddings(vocab=vocab, expand_glove=True)
    self.embeddings = embeddings
    self.embedding_lookup = embedding_lookup

    x_train = [convert_text_to_embeddings_indices(q, embedding_lookup) for q in x_train_text]
    for row in x_train:
        if len(row) == 0:
            row.append(embedding_lookup['UNK'])
    x_train = np.array(x_train)
    y_train = np.array(y_train)
    mems_train = load_memories([' '.join(x) for x in x_train_text], self.n_memories)
    mems_indices_train = memories_to_indices(mems_train, embedding_lookup)

    x_test = [convert_text_to_embeddings_indices(q, embedding_lookup) for q in x_test_text]
    for row in x_test:
        if len(row) == 0:
            row.append(embedding_lookup['UNK'])
    x_test = np.array(x_test)
    y_test = np.array(y_test)
    mems_test = load_memories([' '.join(x) for x in x_test_text], self.n_memories)
    mem_indices_test = memories_to_indices(mems_test, embedding_lookup)

    self.n_classes = compute_n_classes(training_data[1])
def train(self, training_data: TrainingData) -> None:
    log.info('Preprocessing training data...')
    x_train, y_train, x_test, y_test, vocab, class_to_i, i_to_class = preprocess_dataset(
        training_data, create_runs=self.train_on_q_runs, full_question=self.train_on_full_q)
    if self.wiki_data_frac > 0:
        log.info('Using wikipedia with fraction: {}'.format(self.wiki_data_frac))
        wiki_data = FilteredWikipediaDataset().training_data()
        results = preprocess_dataset(
            wiki_data, train_size=1, vocab=vocab,
            class_to_i=class_to_i, i_to_class=i_to_class)
        x_train.extend(results[0])
        y_train.extend(results[1])

    self.class_to_i = class_to_i
    self.i_to_class = i_to_class
    self.vocab = vocab

    log.info('Creating embeddings...')
    embeddings, embedding_lookup = load_embeddings(vocab=vocab, expand_glove=self.expand_we, mask_zero=True)
    self.embeddings = embeddings
    self.embedding_lookup = embedding_lookup

    log.info('Converting dataset to embeddings...')
    x_train = [nn.convert_text_to_embeddings_indices(q, embedding_lookup) for q in x_train]
    x_test = [nn.convert_text_to_embeddings_indices(q, embedding_lookup) for q in x_test]
    self.n_classes = nn.compute_n_classes(training_data[1])
    self.max_len = nn.compute_max_len(training_data)
    x_train = np.array(nn.tf_format(x_train, self.max_len, 0))
    x_test = np.array(nn.tf_format(x_test, self.max_len, 0))

    log.info('Building keras model...')
    self.model = self.build_model()

    log.info('Training model...')
    callbacks = [
        TensorBoard(),
        EarlyStopping(patience=self.max_patience, monitor='val_sparse_categorical_accuracy'),
        ModelCheckpoint(
            safe_path(CNN_MODEL_TMP_TARGET),
            save_best_only=True,
            monitor='val_sparse_categorical_accuracy'
        )
    ]
    if self.decay_lr_on_plateau:
        callbacks.append(ReduceLROnPlateau(monitor='val_sparse_categorical_accuracy', factor=.5, patience=5))
    history = self.model.fit(
        x_train, y_train,
        validation_data=(x_test, y_test),
        batch_size=self.batch_size, epochs=self.max_n_epochs,
        callbacks=callbacks, verbose=2
    )
    self.history = history.history
    log.info('Done training')
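# The call nn.tf_format(x_train, self.max_len, 0) appears to pad (or truncate) every index
# sequence to a fixed length so the batch stacks into a dense array for Keras. Below is a
# minimal sketch of that behaviour under that assumption; pad_to_fixed_len is an illustrative
# helper name, not part of this codebase.
import numpy as np

def pad_to_fixed_len(sequences, max_len, pad_value=0):
    # Pad or truncate each list of embedding indices to exactly max_len entries.
    padded = np.full((len(sequences), max_len), pad_value, dtype=np.int64)
    for i, seq in enumerate(sequences):
        trimmed = seq[:max_len]
        padded[i, :len(trimmed)] = trimmed
    return padded

# pad_to_fixed_len([[3, 7], [1, 2, 3, 4]], max_len=3) -> [[3, 7, 0], [1, 2, 3]]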
def guess(self, questions, max_n_guesses):
    x_test = [
        convert_text_to_embeddings_indices(tokenize_question(q), self.embedding_lookup)
        for q in questions
    ]
    for r in x_test:
        if len(r) == 0:
            log.warn('Found an empty question, adding an UNK token to it so that NaNs do not occur')
            r.append(self.embedding_lookup['UNK'])
    x_test = np.array(x_test)
    y_test = np.zeros(len(x_test))
    _, t_x_batches, lengths, masks, t_y_batches = batchify(
        self.batch_size, x_test, y_test, truncate=False, shuffle=False
    )

    self.model.eval()
    self.model.cuda()
    guesses = []
    for b in range(len(t_x_batches)):
        t_x = Variable(t_x_batches[b], volatile=True)
        length_batch = lengths[b]
        mask_batch = masks[b]
        probs = self.model(t_x, length_batch, mask_batch)
        scores, preds = torch.max(probs, 1)
        scores = scores.data.cpu().numpy()
        preds = preds.data.cpu().numpy()
        for p, s in zip(preds, scores):
            guesses.append([(self.i_to_class[p], s)])

    return guesses
def guess(self, questions: List[QuestionText],
          max_n_guesses: Optional[int]) -> List[List[Tuple[Answer, float]]]:
    x_test = [
        convert_text_to_embeddings_indices(tokenize_question(q), self.embedding_lookup)
        for q in questions
    ]
    for r in x_test:
        if len(r) == 0:
            log.warn('Found an empty question, adding an UNK token to it so that NaNs do not occur')
            r.append(self.embedding_lookup['UNK'])
    x_test = np.array(x_test)
    y_test = np.zeros(len(x_test))
    _, t_x_batches, t_offset_batches, t_y_batches = batchify(
        self.batch_size, x_test, y_test, truncate=False, shuffle=False
    )

    self.model.eval()
    self.model.cuda()
    guesses = []
    for b in range(len(t_x_batches)):
        t_x = Variable(t_x_batches[b], volatile=True)
        t_offset = Variable(t_offset_batches[b], volatile=True)
        out = self.model(t_x, t_offset)
        probs = F.softmax(out)
        scores, preds = torch.max(probs, 1)
        scores = scores.data.cpu().numpy()
        preds = preds.data.cpu().numpy()
        for p, s in zip(preds, scores):
            guesses.append([(self.i_to_class[p], s)])

    return guesses
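# The (t_x, t_offset) pair fed to the model looks like the flattened-indices-plus-offsets
# layout that torch.nn.EmbeddingBag consumes. The sketch below illustrates that convention
# under that assumption, with toy numbers rather than real output from batchify.
import torch
import torch.nn as nn

flat_indices = torch.tensor([4, 9, 9, 2, 7])  # two questions flattened: [4, 9, 9] and [2, 7]
offsets = torch.tensor([0, 3])                # start position of each question in flat_indices

bag = nn.EmbeddingBag(num_embeddings=10, embedding_dim=3, mode='mean')
averaged = bag(flat_indices, offsets)         # shape (2, 3): one averaged word vector per question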
def guess(self, questions: List[QuestionText],
          max_n_guesses: Optional[int]) -> List[List[Tuple[Answer, float]]]:
    log.info('Generating {} guesses for each of {} questions'.format(max_n_guesses, len(questions)))
    x_test = [nn.convert_text_to_embeddings_indices(tokenize_question(q), self.embedding_lookup)
              for q in questions]
    x_test = np.array(nn.tf_format(x_test, self.max_len, 0))
    class_probabilities = self.model.predict_proba(x_test, batch_size=self.batch_size)
    guesses = []
    for row in class_probabilities:
        sorted_labels = np.argsort(-row)[:max_n_guesses]
        sorted_guesses = [self.i_to_class[i] for i in sorted_labels]
        sorted_scores = np.copy(row[sorted_labels])
        guesses.append(list(zip(sorted_guesses, sorted_scores)))
    return guesses
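# For reference, the top-k selection above relies on argsort of the negated probabilities;
# a toy example of the same pattern:
import numpy as np

row = np.array([0.10, 0.60, 0.05, 0.25])  # class probabilities for one question
top2 = np.argsort(-row)[:2]               # indices of the two most probable classes -> [1, 3]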
def guess(self, questions: List[QuestionText], max_n_guesses: Optional[int]):
    x_test = [
        convert_text_to_embeddings_indices(tokenize_question(q), self.embedding_lookup)
        for q in questions
    ]
    for r in x_test:
        if len(r) == 0:
            log.warn('Found an empty question, adding an UNK token to it so that NaNs do not occur')
            r.append(self.embedding_lookup['UNK'])
    x_test = np.array(x_test)
    y_test = np.zeros(len(x_test))
    _, t_x_batches, lengths, t_y_batches, sort_batches = batchify(
        self.batch_size, x_test, y_test, truncate=False, shuffle=False)

    self.model.eval()
    self.model.cuda()
    guesses = []
    hidden = self.model.init_hidden(self.batch_size)
    for b in range(len(t_x_batches)):
        t_x = Variable(t_x_batches[b], volatile=True)
        length_batch = lengths[b]
        sort = sort_batches[b]
        if len(length_batch) != self.batch_size:
            # This could happen for the last batch which is shorter than batch_size
            hidden = self.model.init_hidden(len(length_batch))
        else:
            hidden = repackage_hidden(hidden, reset=True)
        out, hidden = self.model(t_x, length_batch, hidden)
        probs = F.softmax(out)
        scores, preds = torch.max(probs, 1)
        scores = scores.data.cpu().numpy()[np.argsort(sort)]
        preds = preds.data.cpu().numpy()[np.argsort(sort)]
        for p, s in zip(preds, scores):
            guesses.append([(self.i_to_class[p], s)])

    return guesses
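# The indexing by np.argsort(sort) above appears to undo a length-based sort applied inside
# batchify; a minimal sketch of that unsort trick, assuming `sort` holds the permutation
# that produced the sorted batch (toy values, not real batchify output).
import numpy as np

sort = np.array([2, 0, 1])               # sorted position i came from original position sort[i]
preds_sorted = np.array([30, 10, 20])    # predictions in sorted (by length) order
preds_original = preds_sorted[np.argsort(sort)]  # restored to original order: [10, 20, 30]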
def memories_to_indices(mems_list, embedding_lookup):
    all_key_indices = []
    all_value_classes = []
    all_scores = []
    for row in mems_list:
        # Each row contains a list of memories/text
        row_keys = []
        row_values = []
        row_scores = []
        for score, content, page in row:
            row_scores.append(score)
            # For each text in the row, convert it to embedding indices
            key_indices = convert_text_to_embeddings_indices(content, embedding_lookup)
            if len(key_indices) == 0:
                key_indices.append(embedding_lookup['UNK'])
            row_keys.append(key_indices)
            row_values.append(page)
        all_key_indices.append(row_keys)
        all_value_classes.append(row_values)
        all_scores.append(row_scores)

    return np.array(all_key_indices), all_value_classes, np.array(all_scores)
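# From the unpacking `for score, content, page in row`, each memory appears to be a
# (retrieval score, tokenized text, answer page) triple, with one list of memories per
# question. A hypothetical input row under that assumption, purely for illustration:
mems_list = [
    [
        (12.3, ['this', 'poet', 'wrote', 'the', 'raven'], 'Edgar_Allan_Poe'),
        (9.1, ['author', 'of', 'annabel', 'lee'], 'Edgar_Allan_Poe'),
    ]
]
# memories_to_indices(mems_list, embedding_lookup) then returns, per question: the memory
# texts as lists of embedding indices (keys), the answer pages (values), and the scores.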
def train(self, training_data: TrainingData) -> None:
    if self.use_qb:
        x_train_text, y_train, x_test_text, y_test, vocab, class_to_i, i_to_class = preprocess_dataset(
            training_data)
        if self.use_wiki:
            wiki_dataset = WikipediaDataset(set(training_data[1]))
            wiki_train_data = wiki_dataset.training_data()
            w_x_train_text, w_train_y, _, _, _, _, _ = preprocess_dataset(
                wiki_train_data, train_size=1, vocab=vocab,
                class_to_i=class_to_i, i_to_class=i_to_class)
            x_train_text.extend(w_x_train_text)
            y_train.extend(w_train_y)
    else:
        if self.use_wiki:
            wiki_dataset = WikipediaDataset(set(training_data[1]))
            wiki_train_data = wiki_dataset.training_data()
            x_train_text, y_train, x_test_text, y_test, vocab, class_to_i, i_to_class = preprocess_dataset(
                wiki_train_data)
        else:
            raise ValueError(
                'use_wiki and use_qb cannot both be false, otherwise there is no training data'
            )

    self.class_to_i = class_to_i
    self.i_to_class = i_to_class
    self.vocab = vocab

    embeddings, embedding_lookup = load_embeddings(vocab=vocab, expand_glove=True)
    self.embeddings = embeddings
    self.embedding_lookup = embedding_lookup

    x_train = [
        convert_text_to_embeddings_indices(q, embedding_lookup)
        for q in x_train_text
    ]
    for r in x_train:
        if len(r) == 0:
            r.append(embedding_lookup['UNK'])
    x_train = np.array(x_train)
    y_train = np.array(y_train)

    x_test = [
        convert_text_to_embeddings_indices(q, embedding_lookup)
        for q in x_test_text
    ]
    for r in x_test:
        if len(r) == 0:
            r.append(embedding_lookup['UNK'])
    x_test = np.array(x_test)
    y_test = np.array(y_test)

    self.n_classes = compute_n_classes(training_data[1])

    n_batches_train, t_x_train, t_offset_train, t_y_train = batchify(
        self.batch_size, x_train, y_train, truncate=True)
    n_batches_test, t_x_test, t_offset_test, t_y_test = batchify(
        self.batch_size, x_test, y_test, truncate=False)

    self.vocab_size = embeddings.shape[0]
    self.model = DanModel(self.vocab_size, self.n_classes,
                          dropout_prob=self.dropout_prob, k_softmaxes=self.k_softmaxes,
                          n_hidden_units=self.n_hidden_units, non_linearity=self.non_linearity)
    log.info(f'Parameters:\n{pformat(self.parameters())}')
    log.info(f'Torch Model:\n{self.model}')
    self.model.init_weights(initial_embeddings=embeddings)
    if CUDA:
        self.model = self.model.cuda()

    self.optimizer = Adam(self.model.parameters(), lr=self.learning_rate)
    self.criterion = nn.NLLLoss()
    # self.criterion = nn.CrossEntropyLoss()
    self.scheduler = lr_scheduler.ReduceLROnPlateau(self.optimizer, patience=5, verbose=True)

    tb_experiment = ' '.join(
        f'{param}={value}' for param, value in [
            ('model', 'dan'),
            ('n_hidden_units', self.n_hidden_units),
            ('dropout_prob', self.dropout_prob),
            ('k_softmaxes', self.k_softmaxes),
            ('non_linearity', self.non_linearity),
            ('learning_rate', self.learning_rate),
            ('batch_size', self.batch_size)
        ])

    manager = TrainingManager([
        BaseLogger(log_func=log.info), TerminateOnNaN(),
        EarlyStopping(monitor='test_loss', patience=10, verbose=1), MaxEpochStopping(100),
        ModelCheckpoint(create_save_model(self.model), '/tmp/dan.pt', monitor='test_loss'),
        Tensorboard(tb_experiment)
    ])

    log.info('Starting training...')
    while True:
        self.model.train()
        train_acc, train_loss, train_time = self.run_epoch(
            n_batches_train, t_x_train, t_offset_train, t_y_train, evaluate=False)

        self.model.eval()
        test_acc, test_loss, test_time = self.run_epoch(
            n_batches_test, t_x_test, t_offset_test, t_y_test, evaluate=True)

        stop_training, reasons = manager.instruct(
            train_time, train_loss, train_acc,
            test_time, test_loss, test_acc)

        if stop_training:
            log.info(' '.join(reasons))
            break
        else:
            self.scheduler.step(test_loss)

    log.info('Done training')
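# Since the criterion above is nn.NLLLoss (with nn.CrossEntropyLoss left commented out),
# DanModel's forward presumably ends in log-probabilities, e.g. a log_softmax or the log of a
# mixture of softmaxes. The equivalence being traded on, as a self-contained check:
import torch
import torch.nn as nn
import torch.nn.functional as F

logits = torch.randn(4, 10)
targets = torch.tensor([1, 0, 3, 9])
nll = nn.NLLLoss()(F.log_softmax(logits, dim=1), targets)  # NLLLoss expects log-probabilities
ce = nn.CrossEntropyLoss()(logits, targets)                # CrossEntropyLoss takes raw logits
# nll and ce agree up to floating point error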
def train(self, training_data: TrainingData) -> None:
    x_train_text, y_train, x_test_text, y_test, vocab, class_to_i, i_to_class = preprocess_dataset(
        training_data
    )
    self.class_to_i = class_to_i
    self.i_to_class = i_to_class
    self.vocab = vocab

    embeddings, embedding_lookup = load_embeddings(vocab=vocab, expand_glove=True)
    self.embeddings = embeddings
    self.embedding_lookup = embedding_lookup

    x_train = [convert_text_to_embeddings_indices(q, embedding_lookup) for q in x_train_text]
    for r in x_train:
        if len(r) == 0:
            r.append(embedding_lookup['UNK'])
    x_train = np.array(x_train)
    y_train = np.array(y_train)

    x_test = [convert_text_to_embeddings_indices(q, embedding_lookup) for q in x_test_text]
    for r in x_test:
        if len(r) == 0:
            r.append(embedding_lookup['UNK'])
    x_test = np.array(x_test)
    y_test = np.array(y_test)

    self.n_classes = compute_n_classes(training_data[1])

    n_batches_train, t_x_train, t_offset_train, t_y_train = batchify(
        self.batch_size, x_train, y_train, truncate=True)
    n_batches_test, t_x_test, t_offset_test, t_y_test = batchify(
        self.batch_size, x_test, y_test, truncate=False)

    self.model = DanModel(embeddings.shape[0], self.n_classes)
    self.model.init_weights(initial_embeddings=embeddings)
    self.model.cuda()
    self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)
    self.criterion = nn.CrossEntropyLoss()

    manager = TrainingManager([
        BaseLogger(log_func=log.info), TerminateOnNaN(),
        EarlyStopping(monitor='test_acc', patience=10, verbose=1), MaxEpochStopping(100),
        ModelCheckpoint(create_save_model(self.model), '/tmp/dan.pt', monitor='test_acc')
        # Tensorboard('dan', log_dir='tb-logs')
    ])

    log.info('Starting training...')
    while True:
        self.model.train()
        train_acc, train_loss, train_time = self.run_epoch(
            n_batches_train, t_x_train, t_offset_train, t_y_train, evaluate=False
        )

        self.model.eval()
        test_acc, test_loss, test_time = self.run_epoch(
            n_batches_test, t_x_test, t_offset_test, t_y_test, evaluate=True
        )

        stop_training, reasons = manager.instruct(
            train_time, train_loss, train_acc,
            test_time, test_loss, test_acc
        )

        if stop_training:
            log.info(' '.join(reasons))
            break

    log.info('Done training')
def train(self, training_data: TrainingData):
    x_train_text, y_train, x_test_text, y_test, vocab, class_to_i, i_to_class = preprocess_dataset(
        training_data)
    if self.use_wiki:
        wiki_dataset = FilteredWikipediaDataset()
        wiki_train_data = wiki_dataset.training_data()
        w_x_train_text, w_train_y, _, _, _, _, _ = preprocess_dataset(
            wiki_train_data, train_size=1, vocab=vocab,
            class_to_i=class_to_i, i_to_class=i_to_class)
        x_train_text.extend(w_x_train_text)
        y_train.extend(w_train_y)

    self.class_to_i = class_to_i
    self.i_to_class = i_to_class
    self.vocab = vocab

    embeddings, embedding_lookup = load_embeddings(vocab=vocab, expand_glove=True, mask_zero=True)
    self.embeddings = embeddings
    self.embedding_lookup = embedding_lookup

    x_train = [
        convert_text_to_embeddings_indices(q, embedding_lookup, random_unk_prob=.05)
        for q in x_train_text
    ]
    for r in x_train:
        if len(r) == 0:
            r.append(embedding_lookup['UNK'])
    x_train = np.array(x_train)
    y_train = np.array(y_train)

    x_test = [
        convert_text_to_embeddings_indices(q, embedding_lookup, random_unk_prob=.05)
        for q in x_test_text
    ]
    for r in x_test:
        if len(r) == 0:
            r.append(embedding_lookup['UNK'])
    x_test = np.array(x_test)
    y_test = np.array(y_test)

    self.n_classes = compute_n_classes(training_data[1])

    n_batches_train, t_x_train, lengths_train, t_y_train, _ = batchify(
        self.batch_size, x_train, y_train, truncate=True)
    n_batches_test, t_x_test, lengths_test, t_y_test, _ = batchify(
        self.batch_size, x_test, y_test, truncate=False)

    self.model = RnnModel(embeddings.shape[0], self.n_classes)
    self.model.init_weights(embeddings=embeddings)
    self.model.cuda()
    self.optimizer = Adam(self.model.parameters(), lr=self.learning_rate)
    self.criterion = nn.CrossEntropyLoss()
    self.scheduler = lr_scheduler.ReduceLROnPlateau(self.optimizer, 'max', patience=5, verbose=True)

    manager = TrainingManager([
        BaseLogger(log_func=log.info), TerminateOnNaN(),
        EarlyStopping(monitor='test_acc', patience=10, verbose=1), MaxEpochStopping(100),
        ModelCheckpoint(create_save_model(self.model), '/tmp/rnn.pt', monitor='test_acc')
        # Tensorboard('rnn', log_dir='tb-logs')
    ])

    log.info('Starting training...')
    while True:
        self.model.train()
        train_acc, train_loss, train_time = self.run_epoch(
            n_batches_train, t_x_train, lengths_train, t_y_train, evaluate=False)

        self.model.eval()
        test_acc, test_loss, test_time = self.run_epoch(
            n_batches_test, t_x_test, lengths_test, t_y_test, evaluate=True)

        stop_training, reasons = manager.instruct(
            train_time, train_loss, train_acc,
            test_time, test_loss, test_acc)

        if stop_training:
            log.info(' '.join(reasons))
            break
        else:
            self.scheduler.step(test_acc)

    log.info('Done training')
def train(self, training_data: TrainingData) -> None:
    if self.use_qb:
        x_train_text, y_train, x_test_text, y_test, vocab, class_to_i, i_to_class = preprocess_dataset(
            training_data
        )
        if self.use_wiki:
            wiki_dataset = WikipediaDataset(set(training_data[1]))
            wiki_train_data = wiki_dataset.training_data()
            w_x_train_text, w_train_y, _, _, _, _, _ = preprocess_dataset(
                wiki_train_data, train_size=1, vocab=vocab,
                class_to_i=class_to_i, i_to_class=i_to_class
            )
            x_train_text.extend(w_x_train_text)
            y_train.extend(w_train_y)
    else:
        if self.use_wiki:
            wiki_dataset = WikipediaDataset(set(training_data[1]))
            wiki_train_data = wiki_dataset.training_data()
            x_train_text, y_train, x_test_text, y_test, vocab, class_to_i, i_to_class = preprocess_dataset(
                wiki_train_data
            )
        else:
            raise ValueError('use_wiki and use_qb cannot both be false, otherwise there is no training data')

    self.class_to_i = class_to_i
    self.i_to_class = i_to_class
    self.vocab = vocab

    embeddings, embedding_lookup = load_embeddings(vocab=vocab, expand_glove=True)
    self.embeddings = embeddings
    self.embedding_lookup = embedding_lookup

    x_train = [convert_text_to_embeddings_indices(q, embedding_lookup) for q in x_train_text]
    for r in x_train:
        if len(r) == 0:
            r.append(embedding_lookup['UNK'])
    x_train = np.array(x_train)
    y_train = np.array(y_train)

    x_test = [convert_text_to_embeddings_indices(q, embedding_lookup) for q in x_test_text]
    for r in x_test:
        if len(r) == 0:
            r.append(embedding_lookup['UNK'])
    x_test = np.array(x_test)
    y_test = np.array(y_test)

    self.n_classes = compute_n_classes(training_data[1])

    log.info(f'Batching: {len(x_train)} train questions and {len(x_test)} test questions')
    n_batches_train, t_x_train, t_offset_train, t_y_train = batchify(
        self.batch_size, x_train, y_train, truncate=True)
    n_batches_test, t_x_test, t_offset_test, t_y_test = batchify(
        self.batch_size, x_test, y_test, truncate=False)

    self.vocab_size = embeddings.shape[0]
    self.model = DanModel(self.vocab_size, self.n_classes, embeddings=embeddings)
    if CUDA:
        self.model = self.model.cuda()
    log.info(f'Model:\n{self.model}')
    self.optimizer = Adam(self.model.parameters(), lr=self.learning_rate)
    self.criterion = nn.CrossEntropyLoss()
    self.scheduler = lr_scheduler.ReduceLROnPlateau(self.optimizer, patience=5, verbose=True, mode='max')

    manager = TrainingManager([
        BaseLogger(log_func=log.info), TerminateOnNaN(),
        EarlyStopping(monitor='test_acc', patience=10, verbose=1), MaxEpochStopping(100),
        ModelCheckpoint(create_save_model(self.model), '/tmp/dan.pt', monitor='test_acc')
    ])

    log.info('Starting training...')
    while True:
        self.model.train()
        train_acc, train_loss, train_time = self.run_epoch(
            n_batches_train, t_x_train, t_offset_train, t_y_train, evaluate=False
        )

        self.model.eval()
        test_acc, test_loss, test_time = self.run_epoch(
            n_batches_test, t_x_test, t_offset_test, t_y_test, evaluate=True
        )

        stop_training, reasons = manager.instruct(
            train_time, train_loss, train_acc,
            test_time, test_loss, test_acc
        )

        if stop_training:
            log.info(' '.join(reasons))
            break
        else:
            self.scheduler.step(test_acc)

    log.info('Done training')
def train(self, training_data) -> None:
    log.info('Preprocessing data')
    x_train_text, y_train, x_test_text, y_test, vocab, class_to_i, i_to_class = preprocess_dataset(
        training_data
    )
    self.class_to_i = class_to_i
    self.i_to_class = i_to_class
    self.vocab = vocab

    embeddings, embedding_lookup = load_embeddings(vocab=vocab, expand_glove=True, mask_zero=True)
    self.embeddings = embeddings
    self.embedding_lookup = embedding_lookup

    x_train = [convert_text_to_embeddings_indices(q, embedding_lookup) for q in x_train_text]
    for r in x_train:
        if len(r) == 0:
            r.append(embedding_lookup['UNK'])
    x_train = np.array(x_train)
    y_train = np.array(y_train)

    x_test = [convert_text_to_embeddings_indices(q, embedding_lookup) for q in x_test_text]
    for r in x_test:
        if len(r) == 0:
            r.append(embedding_lookup['UNK'])
    x_test = np.array(x_test)
    y_test = np.array(y_test)

    log.info('Batching data')
    n_batches_train, t_x_train, lengths_train, masks_train, t_y_train = batchify(
        self.batch_size, x_train, y_train, truncate=True
    )
    n_batches_test, t_x_test, lengths_test, masks_test, t_y_test = batchify(
        self.batch_size, x_test, y_test, truncate=False, shuffle=False
    )
    self.n_classes = compute_n_classes(training_data[1])

    log.info('Creating model')
    self.model = BCN(
        300, 500,
        embeddings.shape[0], self.n_classes,
        We=torch.from_numpy(embeddings)
    ).cuda()
    self.optimizer = Adam(self.model.parameters())
    self.criterion = nn.NLLLoss()
    log.info(f'Model:\n{self.model}')

    manager = TrainingManager([
        BaseLogger(log_func=log.info), TerminateOnNaN(),
        EarlyStopping(monitor='test_acc', patience=10, verbose=1), MaxEpochStopping(100),
        ModelCheckpoint(create_save_model(self.model), '/tmp/bcn.pt', monitor='test_acc'),
        Tensorboard('bcn')
    ])

    log.info('Starting training...')
    while True:
        self.model.train()
        train_acc, train_loss, train_time = self.run_epoch(
            n_batches_train, t_x_train, lengths_train, masks_train, t_y_train, evaluate=False
        )

        self.model.eval()
        test_acc, test_loss, test_time = self.run_epoch(
            n_batches_test, t_x_test, lengths_test, masks_test, t_y_test, evaluate=True
        )

        stop_training, reasons = manager.instruct(
            train_time, train_loss, train_acc,
            test_time, test_loss, test_acc
        )

        if stop_training:
            log.info(' '.join(reasons))
            break