def valid(mdl: FeedforwardNetwork, data_iter: utils.BatchIterator, LANG, args):
    with torch.no_grad():
        preds = []
        golds = []
        for i, batch in enumerate(
                tqdm.tqdm(data_iter, total=len(data_iter))):
            pred = run_iter(mdl, batch, None, False, None)
            preds.extend(LANG.decode(pred))
            golds.extend(LANG.decode(batch['lang']))
        acc = accuracy_score(golds, preds) * 100
        f1 = f1_score(golds, preds, average='macro') * 100
        precision = precision_score(golds, preds, average='macro') * 100
        recall = recall_score(golds, preds, average='macro') * 100
        res = {
            'acc': round(acc, 2),
            'f1': round(f1, 2),
            'precision': round(precision, 2),
            'recall': round(recall, 2)
        }
        report = classification_report(golds, preds, digits=4)
        utils.save_txt(
            report,
            os.path.join(
                args.mdir, f'report-acc{acc:.2f}-'
                f'f1{f1:.2f}-'
                f'p{precision:.2f}-'
                f'r{recall:.2f}.txt'))
        return res
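# Minimal, self-contained sketch (separate from valid() above) of how the sklearn
# metrics it relies on behave; the `golds`/`preds` lists are made-up toy labels,
# not real model outputs.
from sklearn.metrics import accuracy_score, f1_score, classification_report

golds = ['en', 'en', 'fr', 'de']
preds = ['en', 'fr', 'fr', 'de']
print(accuracy_score(golds, preds) * 100)             # 75.0
print(f1_score(golds, preds, average='macro') * 100)  # unweighted mean of per-class F1
print(classification_report(golds, preds, digits=4))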
def get_iter(dataset,
             batch_size=100,
             shuffle=False,
             cycle=False,
             log_epoch=-1,
             seed=0,
             prefetch=False,
             num_worker=20,
             queue_size=300):
    """Gets a data iterator.

    Args:
        dataset: Dataset object.
        batch_size: Mini-batch size.
        shuffle: Whether to shuffle the data.
        cycle: Whether to keep cycling through the data instead of stopping
            after one full epoch.
        log_epoch: Log progress after this many iterations.
        seed: Random seed for shuffling.
        prefetch: Whether to prefetch batches on background threads.
        num_worker: Number of prefetch threads.
        queue_size: Maximum size of the prefetch queue.

    Returns:
        b: Batch iterator object.
    """
    b = BatchIterator(dataset.get_size(),
                      batch_size=batch_size,
                      shuffle=shuffle,
                      cycle=cycle,
                      get_fn=dataset.get_batch_idx,
                      log_epoch=log_epoch,
                      seed=seed)
    if prefetch:
        b = ConcurrentBatchIterator(b,
                                    max_queue_size=queue_size,
                                    num_threads=num_worker,
                                    log_queue=-1)
    return b
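# Rough usage sketch of the interface get_iter() assumes: a dataset exposing
# get_size() and get_batch_idx(idx). ToyDataset is a hypothetical stand-in, not the
# real Dataset class, and looping over the returned iterator assumes it is iterable
# as in the other snippets here.
class ToyDataset(object):
    def __init__(self, data):
        self.data = data

    def get_size(self):
        return len(self.data)

    def get_batch_idx(self, idx):
        # idx is the array of example indices chosen by BatchIterator.
        return {'x': [self.data[i] for i in idx]}

it = get_iter(ToyDataset(list(range(10))), batch_size=4, shuffle=True, prefetch=False)
for batch in it:
    print(batch['x'])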
def learn(in_data: ndarray, out_data: ndarray, test_in_data: ndarray,
          test_out_data: ndarray,
          model_func: Callable[[Tensor, Tensor, bool], Tensor],
          loss_func: Callable[[Tensor, Tensor, bool], Tensor],
          optimizer: Optimizer,
          score_func: Callable[[Tensor, ndarray], float] = None,
          batch_size: int = 100,
          epoch_number: int = 100):
    input_data = in_data.astype(np.float32)
    output_data = out_data.astype(np.float32)
    test_input_data = test_in_data.astype(np.float32)
    test_output_data = test_out_data.astype(np.float32)
    train_loss_values = []
    test_loss_values = []
    test_score_values = []
    start = time()
    for i in range(epoch_number):
        bit = BatchIterator(input_data, output_data, batch_size)
        iter_loss = 0
        for b_in, b_out in bit:
            x = Tensor(b_in)
            y = Tensor(b_out)
            model = model_func(x, y, True)
            loss = loss_func(y, model, True)
            iter_loss += loss.data[0] / input_data.shape[0]
            optimizer.step(loss)
        if score_func is not None:
            test_loss, err_ratio = score_test(test_input_data, test_output_data,
                                              model_func, loss_func, score_func)
        else:
            err_ratio = 'N/A'
            test_loss = 'N/A'
        train_loss_values.append(iter_loss)
        test_loss_values.append(test_loss)
        test_score_values.append(err_ratio)
        print("Iteration {0} train-loss: {1}, test-loss: {2}, score: {3}%".format(
            i, iter_loss, test_loss, err_ratio))
    end = time()
    print("Execution time: {0}s".format(end - start))
    return train_loss_values, test_loss_values, test_score_values
def __init__(self,
             sess,
             model,
             dataset,
             opt,
             model_opt,
             outputs,
             start_idx=-1,
             end_idx=-1):
    self.dataset = dataset
    self.log = logger.get()
    self.model_opt = model_opt
    self.opt = opt
    self.input_variables = self.get_input_variables()
    if start_idx != -1 and end_idx != -1:
        if start_idx < 0 or end_idx < 0:
            self.log.fatal('Indices must be non-negative.')
        elif start_idx >= end_idx:
            self.log.fatal('End index must be greater than start index.')
        num_ex = end_idx - start_idx
        if end_idx > dataset.get_dataset_size():
            self.log.warning('End index exceeds dataset size.')
            end_idx = dataset.get_dataset_size()
            num_ex = end_idx - start_idx
        self.log.info('Running partial dataset: start {} end {}'.format(
            start_idx, end_idx))
        self.all_idx = np.arange(start_idx, end_idx)
    else:
        self.log.info('Running through entire dataset.')
        num_ex = dataset.get_dataset_size()
        self.all_idx = np.arange(num_ex)
    if num_ex == -1:
        num_ex = dataset.get_dataset_size()
    self.log.info('\nnum_ex: {:d} -- opt batch_size: {:d}\n'.format(
        num_ex, opt['batch_size']))
    batch_iter = BatchIterator(num_ex,
                               batch_size=opt['batch_size'],
                               get_fn=self.get_batch,
                               cycle=False,
                               shuffle=False)
    if opt['prefetch']:
        batch_iter = ConcurrentBatchIterator(batch_iter,
                                             max_queue_size=opt['queue_size'],
                                             num_threads=opt['num_worker'],
                                             log_queue=-1)
    super(OneTimeEvalBase, self).__init__(sess,
                                          model,
                                          batch_iter,
                                          outputs,
                                          num_batch=1,
                                          phase_train=False,
                                          increment_step=False)
def _post_requests(self, ss_resource, requests):
    """Post requests with batchUpdate().

    Args:
        ss_resource: Spreadsheets resource of the Google Sheets API.
        requests (list): Dicts, each of which represents a single request.
    """
    if requests:
        for batch in BatchIterator(requests):
            ss_resource.batchUpdate(spreadsheetId=self.ss_id,
                                    body={"requests": batch}).execute()
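# Hedged sketch of the chunking behaviour assumed above: BatchIterator is taken to
# split a flat list of request dicts into fixed-size slices so each batchUpdate()
# call stays small. ChunkIterator and chunk_size=100 are illustrative stand-ins,
# not the real class.
class ChunkIterator(object):
    def __init__(self, items, chunk_size=100):
        self.items = items
        self.chunk_size = chunk_size

    def __iter__(self):
        for start in range(0, len(self.items), self.chunk_size):
            yield self.items[start:start + self.chunk_size]

requests = [{'updateCells': {}} for _ in range(250)]
for batch in ChunkIterator(requests):
    print(len(batch))  # 100, 100, 50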
def __init__(self,
             sess,
             model,
             dataset,
             num_batch,
             train_opt,
             model_opt,
             outputs,
             step=StepCounter(0),
             loggers=None,
             phase_train=True,
             increment_step=False):
    self.dataset = dataset
    self.loggers = loggers
    self.log = logger.get()
    self.model_opt = model_opt
    self.train_opt = train_opt
    self.input_variables = self.get_input_variables()
    num_ex = dataset.get_dataset_size()
    batch_iter = BatchIterator(
        num_ex,
        batch_size=train_opt['batch_size'],
        get_fn=self.get_batch,
        cycle=True,
        shuffle=True,
        log_epoch=-1)
    if train_opt['prefetch']:
        batch_iter = ConcurrentBatchIterator(
            batch_iter,
            max_queue_size=train_opt['queue_size'],
            num_threads=train_opt['num_worker'],
            log_queue=-1)
    super(Runner, self).__init__(
        sess,
        model,
        batch_iter,
        outputs,
        num_batch=num_batch,
        step=step,
        phase_train=phase_train,
        increment_step=increment_step)
    color_representation_size=54)  # 54)

model_id = str(int(time.time())) + "w_fourier"
save_path = os.path.join(output_path, model_id)
if not os.path.isdir(save_path):
    os.makedirs(save_path)
writer = SummaryWriter(save_path)

if cuda:
    model.cuda()
print(model)

print(str(datetime.now()), 'Generating batches')
train_batches = BatchIterator(train_colors, vocab, batch_size, cuda=cuda)
test_batches = BatchIterator(test_colors, vocab, batch_size, cuda=cuda)

# optimizer = torch.optim.Adam(model.parameters(), lr=0.05)
optimizer = torch.optim.Adagrad(model.parameters(), lr=0.5)

pbar = tqdm.trange(epochs, desc='Training...')
delta = 0
delta_test = 0
for epoch in pbar:
    total_loss = 0
    total_batches = 0
    for i, batch in enumerate(train_batches):
def train_dssm(model,
               experiment_name,
               dataset_name='int_1000',
               evaluate_dataset_name='evaluate_1024_n4',
               n_negatives=4,
               evaluate_batch_size=64,
               use_comet=False,
               comet_tags=None,
               comet_disabled=False,
               save=False,
               n_trajectories=16,
               pairs_per_trajectory=4,
               n_epochs=60,
               patience_ratio=1,
               device='cpu',
               do_eval=True,
               do_quantize=True,
               model_kwargs=None,
               train_on_synthetic_dataset=False):
    """The master function for training a model.

    :param model: the model
    :param experiment_name: name for local saves and Comet.ml
    :param dataset_name: name of the training and test datasets directory inside datasets/
    :param evaluate_dataset_name: name of the 'synthetic' evaluation dataset directory inside datasets/
    :param n_negatives: number of negatives for each positive in the 'synthetic' evaluation dataset
    :param evaluate_batch_size: batch size to use for the evaluation (or training on the 'synthetic'
        data); the actual batch size will be (n_negatives + 1) times larger
    :param use_comet: bool, whether to use Comet.ml
    :param comet_tags: tags for the Comet.ml experiment
    :param comet_disabled: bool, whether to disable Comet.ml logging (for debugging)
    :param save: bool, whether to save results locally
    :param n_trajectories: number of trajectories used for a batch
    :param pairs_per_trajectory: number of pairs from each trajectory used for a batch
    :param n_epochs: number of training epochs
    :param patience_ratio: if the validation quality has not improved for
        n_epochs * patience_ratio epochs, initiates early stopping
    :param device: device to use (usually 'cpu' or 'cuda')
    :param do_eval: bool, whether to perform validation and evaluation
    :param do_quantize: whether to use quantisation in a DSSMEmbed model
    :param model_kwargs: model arguments (just for logging)
    :param train_on_synthetic_dataset: bool, whether the training dataset is 'synthetic'
    """
    np.random.seed(42)
    dataset_path = f'datasets/{dataset_name}/'
    evaluate_dataset_path = f'datasets/{evaluate_dataset_name}/'
    experiment_path_base = 'experiments/'
    batch_size = n_trajectories * pairs_per_trajectory
    patience = max(1, int(n_epochs * patience_ratio))
    if device == 'cuda':
        torch.cuda.empty_cache()
    dtype_for_torch = np.float32 if isinstance(
        model, DSSM) else int  # int for embeddings, float for convolutions

    # Setup Comet.ml
    if use_comet:
        comet_experiment = comet_ml.Experiment(project_name='gridworld_dssm',
                                               auto_metric_logging=False,
                                               disabled=comet_disabled)
        comet_experiment.set_name(experiment_name)
        comet_experiment.log_parameters(model_kwargs)
        if comet_tags:
            comet_experiment.add_tags(comet_tags)
    else:
        comet_experiment = None

    # Setup local save path
    if save:
        experiment_path = pathlib.Path(experiment_path_base + experiment_name)
        experiment_path.mkdir(parents=True, exist_ok=True)

    # Prepare datasets
    if not train_on_synthetic_dataset:
        train_batches = BatchIterator(dataset_path + 'train.pkl',
                                      dataset_path + 'idx_train.pkl',
                                      n_trajectories, pairs_per_trajectory,
                                      None, dtype_for_torch)
        test_batches = BatchIterator(dataset_path + 'test.pkl',
                                     dataset_path + 'idx_test.pkl',
                                     n_trajectories, pairs_per_trajectory,
                                     None, dtype_for_torch)
    else:
        train_batches = EvaluationBatchIterator(dataset_path + 'train.pkl',
                                                dataset_path + 'idx_train.pkl',
                                                n_negatives,
                                                evaluate_batch_size,
                                                dtype_for_torch)
        test_batches = EvaluationBatchIterator(dataset_path + 'test.pkl',
                                               dataset_path + 'idx_test.pkl',
                                               n_negatives,
                                               evaluate_batch_size,
                                               dtype_for_torch)
    evaluate_batches = EvaluationBatchIterator(
        evaluate_dataset_path + 'evaluate.pkl',
        evaluate_dataset_path + 'idx_evaluate.pkl', n_negatives,
        evaluate_batch_size, dtype_for_torch)

    # Setup other training stuff
    criterion = nn.CrossEntropyLoss()
    downscale_factor = n_negatives + 1
    evaluate_target = torch.zeros(evaluate_batch_size, dtype=torch.long).to(device)
    if not train_on_synthetic_dataset:
        target = torch.arange(0, batch_size).to(device)
        train_downscale_factor = 1
    else:
        target = evaluate_target
        train_downscale_factor = downscale_factor
    optimizer = torch.optim.Adam(model.parameters())
    test_accs = []
    best_test_acc = -1
    best_epoch = -1
    model = model.to(device)

    # Training loop
    tqdm_range = tqdm(range(n_epochs))
    for epoch in tqdm_range:
        # Train
        mode = 'train'
        results = run_model(model, optimizer, train_batches, device, criterion,
                            target, mode, do_quantize, train_downscale_factor)
        # Log all metrics
        if use_comet:
            for key, value in results.items():
                if key != 'z_inds_count':
                    comet_experiment.log_metric(f'{mode}_{key}', value, epoch=epoch)
            comet_experiment.log_metric('dssm_scale',
                                        torch.exp(model.scale).item(),
                                        epoch=epoch)
            if (isinstance(model, DSSMEmbed) or isinstance(model, DSSMReverse)) and do_quantize:
                z_inds_count = results['z_inds_count']
                z_inds_count = z_inds_count / z_inds_count.sum()
                comet_experiment.log_text(
                    'Counts: ' + ' '.join(f'{num:.1%}' for num in z_inds_count))

        if do_eval:
            # Validate
            mode = 'validate'
            results = run_model(model, optimizer, test_batches, device,
                                criterion, target, mode, do_quantize,
                                train_downscale_factor)
            # Log all metrics
            if use_comet:
                for key, value in results.items():
                    if key != 'z_inds_count':
                        comet_experiment.log_metric(f'{mode}_{key}', value, epoch=epoch)
                # Log embedding distance matrix
                if isinstance(model, DSSMEmbed) or isinstance(model, DSSMReverse):
                    z_vectors = model.z_vectors_norm if isinstance(
                        model, DSSMEmbed) else model.get_z_vectors()
                    z_vectors = z_vectors.detach()
                    z_vectors_batch = z_vectors.unsqueeze(0)
                    embed_dist_matr = torch.cdist(
                        z_vectors_batch, z_vectors_batch).squeeze().cpu().numpy()
                    np.fill_diagonal(
                        embed_dist_matr,
                        torch.sqrt((z_vectors**2).sum(axis=1)).cpu().numpy())
                    comet_experiment.log_confusion_matrix(
                        matrix=embed_dist_matr,
                        title='Embeddings distance matrix')
            test_accs.append(results['accuracy'])

            # Evaluate
            mode = 'evaluate'
            results = run_model(model, optimizer, evaluate_batches, device,
                                criterion, evaluate_target, mode, do_quantize,
                                downscale_factor)
            # Log all metrics
            if use_comet:
                for key, value in results.items():
                    if key != 'z_inds_count':
                        comet_experiment.log_metric(f'{mode}_{key}', value, epoch=epoch)

            # Save model (best) locally
            if test_accs[-1] > best_test_acc:
                best_test_acc = test_accs[-1]
                best_epoch = epoch
                if save:
                    torch.save(model.state_dict(), experiment_path / 'best_model.pth')
            tqdm_range.set_postfix(test_acc=test_accs[-1], eval_acc=results['accuracy'])

            # Stop if validation accuracy isn't going up
            if epoch > best_epoch + patience:
                n_epochs_run = epoch
                break
        else:
            if save:
                torch.save(model.state_dict(), experiment_path / 'best_model.pth')
    else:  # If no break:
        n_epochs_run = n_epochs

    # Save experiment data and log to Comet.ml
    if save or use_comet:
        info = dict(dataset_path=dataset_path,
                    batch_size=batch_size,
                    n_trajectories=n_trajectories,
                    pairs_per_trajectory=pairs_per_trajectory,
                    n_epochs=n_epochs,
                    n_epochs_run=n_epochs_run,
                    best_epoch=best_epoch,
                    best_test_acc=best_test_acc,
                    device=str(device),
                    dtype_for_torch=dtype_for_torch)
        if save:
            with open(experiment_path / 'info.json', 'w') as f:
                json.dump({**info, **(model_kwargs or {})}, f, indent=4, default=str)
            with open(experiment_path / 'model_kwargs.pkl', 'wb') as f:
                pickle.dump(model_kwargs, f)
            with open(experiment_path_base + 'summary.csv', 'a') as f:
                f.write(experiment_name + f',{best_test_acc}')
        if use_comet:
            comet_experiment.log_parameters(info)
            # save experiment key for later logging
            experiment_keys_path = pathlib.Path('experiments/comet_keys.pkl')
            if not experiment_keys_path.exists():
                experiment_keys = defaultdict(list)
            else:
                with open(experiment_keys_path, 'rb') as f:
                    experiment_keys = pickle.load(f)
            experiment_keys[experiment_name].append(comet_experiment.get_key())
            with open(experiment_keys_path, 'wb') as f:
                pickle.dump(experiment_keys, f)
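# Standalone sketch of the in-batch-negatives objective implied by
# `target = torch.arange(0, batch_size)` above: assuming run_model produces a
# (batch_size x batch_size) score matrix where row i scores state i against every
# goal in the batch, the correct "class" for row i is column i. The random scores
# below are purely for illustration.
import torch
import torch.nn as nn

batch_size = 8
scores = torch.randn(batch_size, batch_size)  # scores[i, j] = similarity(state_i, goal_j)
target = torch.arange(0, batch_size)          # positive pair sits on the diagonal
loss = nn.CrossEntropyLoss()(scores, target)
accuracy = (scores.argmax(dim=1) == target).float().mean()
print(loss.item(), accuracy.item())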
def fit(self, X, y, X_val=None, y_val=None, on_epoch=None, verbose=False):
    """Train the model.

    Parameters
    ----------
    X : input vector for the training set
    y : output vector for the training set. One-hot encoding is required for classification
    X_val : if not None, input vector for the validation set
    y_val : if not None, output vector for the validation set
    on_epoch : a callback that is called after each epoch.
        If X_val is None, the signature is (epoch, training_error, accuracy);
        if X_val is not None, the signature is (epoch, training_error, validation_error, accuracy).
        On iterations that update pi, the training error is reported for the previous iteration.
    verbose : if True, prints the current step on each epoch
    """
    self._create_functions()
    X = X.astype(np.float32)
    y = y.astype(np.float32)
    self._x_mean = np.mean(X, axis=0)
    self._x_std = np.std(X, axis=0)
    self._x_std[self._x_std == 0] = 1
    X = (X - self._x_mean) / self._x_std
    if y_val is not None:
        assert X_val is not None
        X_val = X_val.astype(np.float32)
        y_val = y_val.astype(np.float32)
        X_val = (X_val - self._x_mean) / self._x_std
    if X_val is not None:
        assert y_val is not None
    predictions = self._predict_internal(self._get_output())
    accuracy = T.mean(
        T.eq(predictions, self._predict_internal(self.t_label)))
    test_function = theano.function(
        [self.t_input, self.t_label],
        [self._get_loss_function(), accuracy])
    iterator = BatchIterator(self._batch_size)
    loss = 0
    for epoch in range(self._num_epochs):
        # update the values of pi
        if not __DEBUG_NO_FOREST__ and epoch % self._sgd_iters == 0:
            if verbose:
                print("updating pi")
            self.l_forest.update_pi(X, y)
            if verbose:
                print("recreating update funcs")
            self._create_functions()
        else:
            if verbose:
                print("updating theta")
            loss = 0
            deno = 0
            # update the network parameters
            for Xb, yb in iterator(X, y):
                loss += self._train_function(Xb, yb)
                deno += 1
            loss /= deno
        if X_val is not None:
            tloss = 0
            accur = 0
            deno = 0
            iterator = BatchIterator(self._batch_size)
            for Xb, yb in iterator(X_val, y_val):
                tl, ac = test_function(Xb, yb)
                tloss += tl
                accur += ac
                deno += 1
            tloss /= deno
            accur /= deno
        if on_epoch is not None:
            if X_val is None:
                on_epoch(epoch, loss, self)
            else:
                on_epoch(epoch, loss, tloss, accur, self)
    return self
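# Tiny numpy-only illustration of the standardisation step used in fit(): constant
# features have std == 0, which is clamped to 1 so the division becomes a no-op for
# those columns instead of producing NaNs. The input array is made up.
import numpy as np

X = np.array([[1.0, 5.0], [3.0, 5.0], [5.0, 5.0]], dtype=np.float32)
x_mean = np.mean(X, axis=0)
x_std = np.std(X, axis=0)
x_std[x_std == 0] = 1           # guard against dividing by zero for constant columns
X_norm = (X - x_mean) / x_std
print(X_norm)                   # second column becomes all zeros rather than NaN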
def main():
    args = get_args()
    print('# unit: {}'.format(args.unit))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print()

    # Load dataset
    print('Load dataset')
    train_iter = BatchIterator(args.batchsize, os.path.join(data_dir, "train"))
    dev_iter = BatchIterator(args.batchsize, os.path.join(data_dir, "dev"))
    test_iter = BatchIterator(args.batchsize, os.path.join(data_dir, "test"))
    max_length = max(train_iter.get_maxlen(), dev_iter.get_maxlen(),
                     test_iter.get_maxlen())
    train_iter.set_maxlen(max_length)
    dev_iter.set_maxlen(max_length)
    test_iter.set_maxlen(max_length)

    # Define graph
    print('Defining graph')
    graph = tf.Graph()
    optimizer = tf.train.AdamOptimizer()
    with graph.as_default():
        input_ph = tf.placeholder(tf.float32,
                                  shape=[None, max_length, n_input_units])
        labels_indices = tf.placeholder(tf.int64)
        labels_values = tf.placeholder(tf.int32)
        labels_shape = tf.placeholder(tf.int64)
        labels = tf.SparseTensor(labels_indices, labels_values, labels_shape)
        seq_lengths_ph = tf.placeholder(tf.int32)

        output_op = _inference(input_ph, seq_lengths_ph)
        loss_op = _loss(output_op, labels, seq_lengths_ph)
        # vis_loss_op = _vis_loss(loss_op, seq_lengths_ph)
        train_op = optimizer.minimize(loss_op)
        error_op = _error(output_op, labels, seq_lengths_ph)

    with tf.Session(graph=graph) as session:
        print('initializing')
        tf.initialize_all_variables().run()
        headers = ["epoch", "train_loss", "dev_loss", "dev_PER"]
        result_writer = ResultWriter(headers)
        for epoch in xrange(args.epoch):
            print('-' * 50)
            print('Epoch', epoch + 1, '...')

            # train
            train_losses = []
            progress = 0
            for x_batches, t_batches, seq_lengths in train_iter:
                feed_dict = {
                    input_ph: x_batches,
                    labels_indices: t_batches[0],
                    labels_values: t_batches[1],
                    labels_shape: t_batches[2],
                    seq_lengths_ph: seq_lengths,
                }
                loss, _ = session.run([loss_op, train_op], feed_dict=feed_dict)
                train_losses.append(loss)
                progress += len(x_batches)
                print_progress(train_iter.size(), progress)
            print()
            train_loss = np.average(train_losses)
            print("train loss: {0}".format(train_loss))

            # dev
            dev_losses = []
            dev_errors = []
            progress = 0
            for x_batches, t_batches, seq_lengths in dev_iter:
                feed_dict = {
                    input_ph: x_batches,
                    labels_indices: t_batches[0],
                    labels_values: t_batches[1],
                    labels_shape: t_batches[2],
                    seq_lengths_ph: seq_lengths,
                }
                loss, error = session.run([loss_op, error_op],
                                          feed_dict=feed_dict)
                dev_losses.append(loss)
                dev_errors.append(error)
                progress += len(x_batches)
                print_progress(dev_iter.size(), progress)
            print()
            dev_loss = np.average(dev_losses)
            dev_error = np.average(dev_errors)
            print("dev loss: {0}, dev PER: {1}".format(dev_loss, dev_error))
            result_writer.write(epoch + 1, train_loss, dev_loss, dev_error)
        result_writer.close()
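# Hedged sketch of the (indices, values, shape) triple that the placeholders above
# feed into tf.SparseTensor. The real conversion presumably happens inside
# BatchIterator; to_sparse_triple() and the label lists below are only an
# illustration of the assumed format, not the project's actual code.
import numpy as np

def to_sparse_triple(label_seqs):
    indices, values = [], []
    for batch_idx, seq in enumerate(label_seqs):
        for time_idx, label in enumerate(seq):
            indices.append([batch_idx, time_idx])
            values.append(label)
    shape = [len(label_seqs), max(len(seq) for seq in label_seqs)]
    return (np.array(indices, dtype=np.int64),
            np.array(values, dtype=np.int32),
            np.array(shape, dtype=np.int64))

indices, values, shape = to_sparse_triple([[3, 1, 4], [1, 5]])
print(indices.shape, values, shape)   # (5, 2) [3 1 4 1 5] [2 3]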
def test_multi_pass():
    import cifar_exp_config as cifar_conf
    from data import CIFAR10Dataset
    from utils import BatchIterator
    import os
    if os.path.exists("/ais/gobi4"):
        folder = "/ais/gobi4/mren/data/cifar-10"
    else:
        folder = "/home/mren/data/cifar-10"
    data = CIFAR10Dataset(folder=folder, split="valid")
    config = cifar_conf.BaselineConfig()
    b = BatchIterator(data.get_size(),
                      batch_size=8,
                      shuffle=False,
                      cycle=False,
                      get_fn=data.get_batch_idx)

    # Testing the batch iterator.
    b1 = b.next()
    b.reset()
    b2 = b.next()
    np.testing.assert_almost_equal(b1["img"], b2["img"])
    b.reset()

    config.pool_fn = ["avg_pool", "avg_pool", "avg_pool"]
    num_rep = 4
    num_pas = 2
    learn_rate = 1.0
    decimal_tol = 5
    num_elem_dbg = 3
    wlist = [
        "mlp/layer_1/w", "mlp/layer_0/w", "cnn/layer_2/w", "cnn/layer_1/w",
        "cnn/layer_0/w", "mlp/layer_1/b", "mlp/layer_0/b", "cnn/layer_2/b",
        "cnn/layer_1/b", "cnn/layer_0/b"
    ]

    for wname in wlist:
        with log.verbose_level(2):
            ######################################
            # Run the MultiPass model.
            ######################################
            with tf.Graph().as_default():
                s1 = tf.Session()
                with tf.variable_scope("Model"):
                    m1 = MultiPassMultiTowerModel(config,
                                                  num_replica=num_rep,
                                                  num_passes=num_pas)
                tf.set_random_seed(1234)
                s1.run(tf.initialize_all_variables())
                m1.assign_lr(s1, learn_rate)
                batch = b.next()
                with tf.variable_scope("Model", reuse=True):
                    w1 = s1.run(tf.get_variable(wname))
                ce1 = m1.train_step(s1, batch["img"], batch["label"])
                with tf.variable_scope("Model", reuse=True):
                    w1d = s1.run(tf.get_variable(wname))
                b.reset()

            ######################################
            # Run the regular MultiTower model.
            ######################################
            with tf.Graph().as_default():
                s2 = tf.Session()
                with tf.variable_scope("Model2") as scope:
                    m2 = MultiTowerModel(config, num_replica=num_rep)
                tf.set_random_seed(1234)
                s2.run(tf.initialize_all_variables())
                m2.assign_lr(s2, learn_rate)
                with tf.variable_scope("Model2", reuse=True):
                    w2 = s2.run(tf.get_variable(wname))
                ce2 = m2.train_step(s2, batch["img"], batch["label"])
                with tf.variable_scope("Model2", reuse=True):
                    w2d = s2.run(tf.get_variable(wname))
                b.reset()

            ######################################
            # Run the regular model.
            ######################################
            with tf.Graph().as_default():
                s3 = tf.Session()
                with tf.variable_scope("Model3") as scope:
                    m3 = CNNModel(config)
                tf.set_random_seed(1234)
                s3.run(tf.initialize_all_variables())
                m3.assign_lr(s3, learn_rate)
                with tf.variable_scope("Model3", reuse=True):
                    w3 = s3.run(tf.get_variable(wname))
                ce3 = m3.train_step(s3, batch["img"], batch["label"])
                with tf.variable_scope("Model3", reuse=True):
                    w3d = s3.run(tf.get_variable(wname))
                b.reset()

        # Make this block one indent level up to avoid verbose logging.
        ######################################
        # Make sure the weights are the same.
        ######################################
        log.info("Testing {}".format(wname))
        print_w("w1", w1, num_elem_dbg)
        print_w("w2", w2, num_elem_dbg)
        print_w("w3", w3, num_elem_dbg)
        np.testing.assert_almost_equal(w1, w2, decimal=decimal_tol)
        np.testing.assert_almost_equal(w2, w3, decimal=decimal_tol)

        ######################################
        # Make sure the gradients are the same.
        ######################################
        print_w("w1 delta", w1d - w1, num_elem_dbg)
        print_w("w2 delta", w2d - w2, num_elem_dbg)
        print_w("w3 delta", w3d - w3, num_elem_dbg)
        print_w("w1 new", w1d, num_elem_dbg)
        print_w("w2 new", w2d, num_elem_dbg)
        print_w("w3 new", w3d, num_elem_dbg)
        np.testing.assert_almost_equal(get_diff_signature(w1, w1d),
                                       get_diff_signature(w2, w2d),
                                       decimal=decimal_tol)
        np.testing.assert_almost_equal(get_diff_signature(w2, w2d),
                                       get_diff_signature(w3, w3d),
                                       decimal=decimal_tol)
        log.info("Success")