def main():
    """Train a character-level RNN to classify names by language/category,
    plot the training loss curve, then render a normalised confusion matrix.

    Relies on module-level globals: ``n_hidden``, ``n_iters``,
    ``print_every``, ``plot_every`` and ``all_losses`` (the list loss
    averages are appended to) — TODO confirm all are defined at module scope.
    """
    dataset = Dataset()
    dataset.read_name_data()
    rnn = RNN(dataset.n_letters, n_hidden, dataset.n_categories)
    start = time.time()
    current_loss = 0  # running loss sum; reset every `plot_every` iterations
    # NOTE(review): loop variable 'iter' shadows the builtin of the same name.
    for iter in range(1, n_iters + 1):
        category, line, category_tensor, line_tensor = \
            randomTrainingExample(dataset.all_categories, dataset.category_lines)
        output, loss = train(rnn, category_tensor, line_tensor)
        current_loss += loss

        # Print iter number, loss, name and guess
        if iter % print_every == 0:
            guess, guess_i = categoryFromOutput(output, dataset)
            correct = '✓' if guess == category else '✗ (%s)' % category
            print('%d %d%% (%s) %.4f %s / %s %s' %
                  (iter, iter / n_iters * 100, timeSince(start), loss, line,
                   guess, correct))

        # Add current loss avg to list of losses
        if iter % plot_every == 0:
            all_losses.append(current_loss / plot_every)
            current_loss = 0

    # Plotting the historical loss
    plt.figure()
    plt.plot(all_losses)
    plt.title("The negative log likelihood(NLL) loss per iter")
    plt.xlabel("n_iter")
    plt.ylabel("NLL loss")
    plt.show()

    # Evaluate the trained RNN: tally gold-category vs. predicted-category
    # counts over n_confusion random samples from the training distribution.
    confusion = torch.zeros(dataset.n_categories, dataset.n_categories)
    n_confusion = 10000
    for i in range(n_confusion):
        category, line, category_tensor, line_tensor = \
            randomTrainingExample(dataset.all_categories, dataset.category_lines)
        # NOTE(review): helper appears to be spelled 'evalute' at its
        # definition site elsewhere in the project — confirm before renaming.
        output = evalute(rnn, line_tensor)
        guess, guess_i = categoryFromOutput(output, dataset)
        category_i = dataset.all_categories.index(category)
        confusion[category_i][guess_i] += 1

    # Normalise each row so it sums to 1 (per-category prediction distribution).
    for i in range(dataset.n_categories):
        confusion[i] = confusion[i] / confusion[i].sum()

    # Set up plot
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(confusion.numpy())
    fig.colorbar(cax)

    # Set up axes
    ax.set_xticklabels([''] + dataset.all_categories, rotation=90)
    ax.set_yticklabels([''] + dataset.all_categories)

    # Force label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    # sphinx_gallery_thumbnail_number = 2
    plt.show()
def main():
    """Entry point for DeepLPF: either run inference with a saved checkpoint
    (when both --checkpoint_filepath and --inference_img_dirpath are given)
    or train a new DeepLPFNet on the Adobe5k train/valid/test splits.

    Side effects: creates a timestamped ./log_<timestamp> directory, writes
    a deep_lpf.log file, TensorBoard scalars, and model snapshots.
    """
    writer = SummaryWriter()
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    log_dirpath = "./log_" + timestamp
    os.mkdir(log_dirpath)

    # Log both to a file inside the run directory and to stderr.
    handlers = [logging.FileHandler(log_dirpath + "/deep_lpf.log"),
                logging.StreamHandler()]
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(levelname)s %(message)s',
        handlers=handlers)

    parser = argparse.ArgumentParser(
        description="Train the DeepLPF neural network on image pairs")
    parser.add_argument(
        "--num_epoch", type=int, required=False,
        help="Number of epoches (default 5000)", default=100000)
    parser.add_argument(
        "--valid_every", type=int, required=False,
        help="Number of epoches after which to compute validation accuracy",
        default=25)
    parser.add_argument(
        "--checkpoint_filepath", required=False,
        help="Location of checkpoint file", default=None)
    parser.add_argument(
        "--inference_img_dirpath", required=False,
        help="Directory containing images to run through a saved DeepLPF model instance",
        default=None)
    args = parser.parse_args()
    num_epoch = args.num_epoch
    valid_every = args.valid_every
    checkpoint_filepath = args.checkpoint_filepath
    inference_img_dirpath = args.inference_img_dirpath

    logging.info('######### Parameters #########')
    logging.info('Number of epochs: ' + str(num_epoch))
    logging.info('Logging directory: ' + str(log_dirpath))
    logging.info('Dump validation accuracy every: ' + str(valid_every))
    logging.info('##############################')

    # NOTE(review): dataset locations are hard-coded absolute paths — these
    # should probably be CLI flags; confirm with the project owner.
    training_data_loader = Adobe5kDataLoader(
        data_dirpath="/home/sjm213/adobe5k/adobe5k/",
        img_ids_filepath="/home/sjm213/adobe5k/adobe5k/images_train.txt")
    training_data_dict = training_data_loader.load_data()
    # Train split gets random flips for augmentation; normaliser 255 suggests
    # 8-bit input images (TODO confirm against Dataset implementation).
    training_dataset = Dataset(
        data_dict=training_data_dict,
        transform=transforms.Compose(
            [transforms.ToPILImage(),
             transforms.RandomHorizontalFlip(),
             transforms.RandomVerticalFlip(),
             transforms.ToTensor()]),
        normaliser=2 ** 8 - 1,
        is_valid=False)

    validation_data_loader = Adobe5kDataLoader(
        data_dirpath="/home/sjm213/adobe5k/adobe5k/",
        img_ids_filepath="/home/sjm213/adobe5k/adobe5k/images_valid.txt")
    validation_data_dict = validation_data_loader.load_data()
    validation_dataset = Dataset(
        data_dict=validation_data_dict,
        transform=transforms.Compose([transforms.ToTensor()]),
        normaliser=2 ** 8 - 1,
        is_valid=True)

    testing_data_loader = Adobe5kDataLoader(
        data_dirpath="/home/sjm213/adobe5k/adobe5k/",
        img_ids_filepath="/home/sjm213/adobe5k/adobe5k/images_test.txt")
    testing_data_dict = testing_data_loader.load_data()
    testing_dataset = Dataset(
        data_dict=testing_data_dict,
        transform=transforms.Compose([transforms.ToTensor()]),
        normaliser=2 ** 8 - 1,
        is_valid=True)

    # Rebind the *_data_loader names to torch DataLoaders (batch size 1).
    training_data_loader = torch.utils.data.DataLoader(
        training_dataset, batch_size=1, shuffle=True, num_workers=10)
    testing_data_loader = torch.utils.data.DataLoader(
        testing_dataset, batch_size=1, shuffle=False, num_workers=10)
    validation_data_loader = torch.utils.data.DataLoader(
        validation_dataset, batch_size=1, shuffle=False, num_workers=10)

    if (checkpoint_filepath is not None) and (inference_img_dirpath is not None):
        # Inference-only path: load a saved model and evaluate it on the
        # images listed in <inference_img_dirpath>/images_inference.txt.
        inference_data_loader = Adobe5kDataLoader(
            data_dirpath=inference_img_dirpath,
            img_ids_filepath=inference_img_dirpath + "/images_inference.txt")
        inference_data_dict = inference_data_loader.load_data()
        inference_dataset = Dataset(
            data_dict=inference_data_dict,
            transform=transforms.Compose([transforms.ToTensor()]),
            normaliser=2 ** 8 - 1,
            is_valid=True)
        inference_data_loader = torch.utils.data.DataLoader(
            inference_dataset, batch_size=1, shuffle=False, num_workers=10)
        '''
        Performs inference on all the images in inference_img_dirpath
        '''
        logging.info(
            "Performing inference with images in directory: " + inference_img_dirpath)
        net = torch.load(checkpoint_filepath,
                         map_location=lambda storage, location: storage)
        # switch model to evaluation mode
        net.eval()
        criterion = model.DeepLPFLoss()
        testing_evaluator = metric.Evaluator(
            criterion, inference_data_loader, "test", log_dirpath)
        testing_evaluator.evaluate(net, epoch=0)
    else:
        # Training path.
        print(torch.cuda.is_available())
        net = model.DeepLPFNet()
        net.cuda(0)  # pins the model to GPU 0
        logging.info('######### Network created #########')
        logging.info('Architecture:\n' + str(net))
        for name, param in net.named_parameters():
            if param.requires_grad:
                print(name)
        criterion = model.DeepLPFLoss(ssim_window_size=5)
        '''
        The following objects allow for evaluation of a model on the testing and validation splits of a dataset
        '''
        validation_evaluator = metric.Evaluator(
            criterion, validation_data_loader, "valid", log_dirpath)
        testing_evaluator = metric.Evaluator(
            criterion, testing_data_loader, "test", log_dirpath)
        optimizer = optim.Adam(
            filter(lambda p: p.requires_grad, net.parameters()),
            lr=1e-4, betas=(0.9, 0.999), eps=1e-08)
        best_valid_psnr = 0.0
        alpha = 0.0  # NOTE(review): unused below — possibly vestigial
        optimizer.zero_grad()
        net.train()
        running_loss = 0.0
        examples = 0
        psnr_avg = 0.0  # NOTE(review): unused below
        ssim_avg = 0.0  # NOTE(review): unused below
        batch_size = 1  # must match the DataLoader batch_size above
        total_examples = 0
        for epoch in range(num_epoch):
            # Train loss
            examples = 0.0
            running_loss = 0.0
            for batch_num, data in enumerate(training_data_loader, 0):
                input_img_batch, output_img_batch, category = \
                    Variable(data['input_img'], requires_grad=False).cuda(), \
                    Variable(data['output_img'], requires_grad=False).cuda(), \
                    data['name']
                start_time = time.time()
                net_output_img_batch = net(input_img_batch)
                # Clamp the prediction into valid [0, 1] image range.
                net_output_img_batch = torch.clamp(net_output_img_batch, 0.0, 1.0)
                elapsed_time = time.time() - start_time
                loss = criterion(net_output_img_batch, output_img_batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                # NOTE(review): loss.data[0] is the pre-0.4 PyTorch idiom;
                # newer versions require loss.item().
                running_loss += loss.data[0]
                examples += batch_size
                total_examples += batch_size
                writer.add_scalar('Loss/train', loss.data[0], total_examples)
            logging.info('[%d] train loss: %.15f' %
                         (epoch + 1, running_loss / examples))
            writer.add_scalar('Loss/train_smooth', running_loss / examples,
                              epoch + 1)
            # Valid loss
            '''
            examples = 0.0
            running_loss = 0.0
            for batch_num, data in enumerate(validation_data_loader, 0):
                net.eval()
                input_img_batch, output_img_batch, category = Variable(
                    data['input_img'], requires_grad=False).cuda(), \
                    Variable(data['output_img'], requires_grad=False).cuda(), \
                    data['name']
                net_output_img_batch = net(input_img_batch)
                net_output_img_batch = torch.clamp(
                    net_output_img_batch, 0.0, 1.0)
                optimizer.zero_grad()
                loss = criterion(net_output_img_batch, output_img_batch)
                running_loss += loss.data[0]
                examples += batch_size
                total_examples += batch_size
                writer.add_scalar('Loss/train', loss.data[0], total_examples)
            logging.info('[%d] valid loss: %.15f' % (epoch + 1, running_loss / examples))
            writer.add_scalar('Loss/valid_smooth', running_loss / examples, epoch + 1)
            net.train()
            '''
            if (epoch + 1) % valid_every == 0:
                logging.info("Evaluating model on validation and test dataset")
                valid_loss, valid_psnr, valid_ssim = validation_evaluator.evaluate(
                    net, epoch)
                test_loss, test_psnr, test_ssim = testing_evaluator.evaluate(
                    net, epoch)
                # update best validation set psnr
                if valid_psnr > best_valid_psnr:
                    logging.info(
                        "Validation PSNR has increased. Saving the more accurate model to file: " +
                        'deeplpf_validpsnr_{}_validloss_{}_testpsnr_{}_testloss_{}_epoch_{}_model.pt'.format(
                            valid_psnr, valid_loss.tolist()[0], test_psnr,
                            test_loss.tolist()[0], epoch))
                    best_valid_psnr = valid_psnr
                    snapshot_prefix = os.path.join(log_dirpath, 'deeplpf')
                    snapshot_path = snapshot_prefix + \
                        '_validpsnr_{}_validloss_{}_testpsnr_{}_testloss_{}_epoch_{}_model.pt'.format(
                            valid_psnr, valid_loss.tolist()[0], test_psnr,
                            test_loss.tolist()[0], epoch)
                    torch.save(net.state_dict(), snapshot_path)
                net.train()
        '''
        Run the network over the testing dataset split
        '''
        testing_evaluator.evaluate(net, epoch=0)
        # Final snapshot regardless of validation performance.
        snapshot_prefix = os.path.join(log_dirpath, 'deep_lpf')
        snapshot_path = snapshot_prefix + "_" + str(num_epoch)
        torch.save(net.state_dict(), snapshot_path)
def train(args):
    """Build and fit a normalizing-flow model from an INI-style config dict,
    optionally data-parallel across nodes via Tarantella.

    args: nested mapping of config sections -> option-name -> string value;
    several options are parsed with eval() (see security note below).
    """
    # SECURITY NOTE(review): eval() on config strings executes arbitrary code;
    # acceptable only if the config file is fully trusted. Consider
    # ast.literal_eval for the literal-valued options.
    use_tarantella = eval(args['training']['use_tarantella'])
    # Total number of data dimensions, used to normalise the Jacobian term.
    ndims_tot = np.prod(eval(args['data']['data_dimensions']))
    output_dir = args['checkpoints']['output_dir']
    sched_milestones = eval(args['training']['milestones_lr_decay'])
    n_epochs = eval(args['training']['N_epochs'])
    optimizer_kwargs = eval(args['training']['optimizer_kwargs'])
    optimizer_type = args['training']['optimizer']
    optimizer_lr = eval(args['training']['lr'])

    if use_tarantella:
        import tarantella
        # no argument (otherwise: ranks per node)
        tarantella.init()
        node_rank = tarantella.get_rank()
        nodes_number = tarantella.get_size()
    else:
        # Single-node fallback: pretend to be rank 0 of 1.
        node_rank = 0
        nodes_number = 1

    is_primary_node = (node_rank == 0)
    # Stash rank info back into the config (as strings, matching its format)
    # so downstream builders can read it.
    args['training']['rank'] = repr(node_rank)
    args['training']['comm_size'] = repr(nodes_number)

    model = build_model(args)
    data = Dataset(args)

    print(f'NODE_RANK {node_rank}')
    print(f'N_NODES {nodes_number}')
    # NOTE(review): label says NODE_RANK but the value is is_primary_node.
    print(f'NODE_RANK {str(is_primary_node).upper()}', flush=True)

    def nll_loss_z_part(y, z):
        # 0.5 * E[z^2]: Gaussian prior term of the negative log-likelihood.
        zz = tf.math.reduce_mean(z**2)
        return 0.5 * zz

    def nll_loss_jac_part(y, jac):
        # Log-det-Jacobian term, normalised per data dimension.
        return -tf.math.reduce_mean(jac) / ndims_tot

    def lr_sched(ep, lr):
        # Multiply LR by 0.1 at each configured milestone epoch.
        if ep in sched_milestones:
            return 0.1 * lr
        return lr

    # TODO: should this only be for one node, or for each?
    lr_scheduler_callback = kr.callbacks.LearningRateScheduler(
        lr_sched, verbose=is_primary_node)
    callbacks = [lr_scheduler_callback, kr.callbacks.TerminateOnNaN()]

    if is_primary_node:
        # Only the primary node writes logs/checkpoints to disk.
        #checkpoint_callback = kr.callbacks.ModelCheckpoint(filepath=os.path.join(output_dir, 'checkpoint_best.hdf5'),
        #save_best_only=True,
        #save_weights_only=True,
        #mode='min',
        #verbose=is_primary_node)
        loss_log_callback = kr.callbacks.CSVLogger(
            os.path.join(output_dir, 'losses.dat'), separator=' ')
        #callbacks.append(checkpoint_callback)
        callbacks.append(loss_log_callback)

    # Map a known optimizer name to its class; otherwise eval() the string
    # (same security caveat as above).
    try:
        optimizer_type = {
            'ADAM': kr.optimizers.Adam,
            'SGD': kr.optimizers.SGD
        }[optimizer_type]
    except KeyError:
        optimizer_type = eval(optimizer_type)
    optimizer = optimizer_type(optimizer_lr, **optimizer_kwargs)

    if use_tarantella:
        # Wrap for distributed data-parallel training.
        model = tarantella.Model(model)

    model.compile(loss=[nll_loss_z_part, nll_loss_jac_part],
                  optimizer=optimizer,
                  run_eagerly=False)
    # NOTE(review): input shape (batch 128, 32x32 RGB) is hard-coded here —
    # presumably should agree with args['data']['data_dimensions']; confirm.
    model.build((128, 32, 32, 3))

    try:
        history = model.fit(
            data.train_dataset,
            epochs=n_epochs,
            verbose=is_primary_node,
            callbacks=callbacks,
            validation_data=(data.test_dataset if is_primary_node else None))
    except:
        # NOTE(review): bare re-raise is a no-op; likely a leftover hook for
        # cleanup/logging on failure.
        raise
def setUp(self):
    """Create a fresh two-field fixture (source/target) and the Dataset
    built from it before each test."""
    fields = {}
    examples = {}
    # Keys of both dicts must line up: one Field and one example list per side.
    for side in ('source', 'target'):
        fields[side] = Field()
        examples[side] = [1, 2, 3]
    self.fields = fields
    self.examples = examples
    self.dataset = Dataset(self.examples, self.fields)
def main(args):
    """Train (or resume / evaluate) a controllable-generation predictor.

    Writes dataset_info (and rhyme_info for the rhyme task) to args.save_dir,
    optionally restores model+optimizer from args.ckpt, then runs the
    train/validate loop, checkpointing per epoch and on new best metric.
    """
    dataset = Dataset(args)
    os.makedirs(args.save_dir, exist_ok=True)
    # Persist dataset metadata so inference can rebuild the same vocab.
    with open(os.path.join(args.save_dir, 'dataset_info'), 'wb') as wf:
        pickle.dump(dataset.dataset_info, wf)
    if args.task == 'rhyme':
        with open(os.path.join(args.save_dir, 'rhyme_info'), 'wb') as wf:
            pickle.dump(dataset.rhyme_info, wf)

    if args.ckpt:
        # Resume path: rebuild the model from the args saved in the checkpoint
        # (not the current CLI args), then restore weights and optimizer state.
        checkpoint = torch.load(args.ckpt, map_location=args.device)
        start_epoch = checkpoint['epoch'] + 1  # NOTE(review): unused below
        best_val_metric = checkpoint['best_metric']
        model_args = checkpoint['args']
        model = Model(
            model_args, dataset.gpt_pad_id, len(dataset.index2word),
            rhyme_group_size=len(dataset.index2rhyme_group)
            if args.task == 'rhyme' else None
        )  # no need to get the glove embeddings when reloading since they're saved in model ckpt anyway
        model.load_state_dict(checkpoint['state_dict'])
        model = model.to(args.device)
        optimizer = torch.optim.Adam(model.parameters(), lr=model_args.lr)
        optimizer.load_state_dict(checkpoint['optimizer'])
        data_start_index = checkpoint['data_start_index']
        print("=> loaded checkpoint '{}' (epoch {})".format(
            args.ckpt, checkpoint['epoch']))
        # NOTE: just import pdb after loading the model here if you want to play with it, it's easy
        # model.eval()
        # import pdb; pdb.set_trace()
    else:
        # Fresh-start path: build from current args, with GloVe embeddings.
        model = Model(args, dataset.gpt_pad_id, len(dataset.index2word),
                      rhyme_group_size=len(dataset.index2rhyme_group)
                      if args.task == 'rhyme' else None,
                      glove_embeddings=dataset.glove_embeddings)
        model = model.to(args.device)
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        best_val_metric = 1e8  # lower is better for BCE
        data_start_index = 0

    print('num params', num_params(model))
    criterion = nn.BCEWithLogitsLoss().to(args.device)

    if args.evaluate:
        # Evaluation-only mode: one validation pass, then exit.
        epoch = 0
        validate(model, dataset, criterion, epoch, args)
        return

    for epoch in range(args.epochs):
        print("TRAINING: Epoch {} at {}".format(epoch, time.ctime()))
        # train() returns the next data_start_index so streaming resumes
        # where the previous epoch stopped.
        data_start_index = train(model, dataset, optimizer, criterion, epoch,
                                 args, data_start_index)
        if epoch % args.validation_freq == 0:
            print("VALIDATION: Epoch {} at {}".format(epoch, time.ctime()))
            metric = validate(model, dataset, criterion, epoch, args)
            if not args.debug:
                if metric < best_val_metric:
                    print('new best val metric', metric)
                    best_val_metric = metric
                    save_checkpoint(
                        {
                            'epoch': epoch,
                            'state_dict': model.state_dict(),
                            'best_metric': best_val_metric,
                            'optimizer': optimizer.state_dict(),
                            'data_start_index': data_start_index,
                            'args': args
                        }, os.path.join(args.save_dir, 'model_best.pth.tar'))
                # Also keep a per-epoch snapshot regardless of best status.
                save_checkpoint(
                    {
                        'epoch': epoch,
                        'state_dict': model.state_dict(),
                        'best_metric': metric,
                        'optimizer': optimizer.state_dict(),
                        'data_start_index': data_start_index,
                        'args': args
                    },
                    os.path.join(args.save_dir,
                                 'model_epoch' + str(epoch) + '.pth.tar'))
def train(model, loss, params):
    """Train a recurrent language model with plain SGD and LR-quartering.

    model: RNN LM exposing init_hidden() and forward(x, state=...).
    loss: criterion over (flattened logits, flattened targets).
    params: namespace with data_dir, vocab, batch_size, num_steps, epochs,
    lr, clip, disp_freq, device, save.
    Saves the full model to params.save whenever dev loss improves; otherwise
    divides the learning rate by 4.
    """
    train_data = Dataset(
        os.path.join(params.data_dir, "train.txt"),
        params.vocab,
    )
    total_loss = 0.
    global_step = 0
    start_time = time.time()
    best_valid_loss = None
    # Hidden state persists across batches (truncated BPTT).
    hidden = model.init_hidden(params.batch_size)
    lrate = params.lr
    for epoch in range(params.epochs):
        for bidx, batch in enumerate(
                train_data.batcher(
                    params.batch_size,
                    params.num_steps,
                )):
            model.train()
            x, t = batch
            x = x.to(params.device)
            t = t.to(params.device)
            # Detach hidden state so gradients don't flow across batches.
            hidden = repackage_hidden(hidden)
            model.zero_grad()
            logits, hidden = model(x, state=hidden)
            gloss = loss(logits.view(-1, logits.size(-1)), t.view(-1))
            gloss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), params.clip)
            # Manual SGD step (no optimizer object).
            # NOTE(review): add_(-lrate, grad) is the deprecated two-positional
            # form; modern torch spells it add_(grad, alpha=-lrate).
            for p in model.parameters():
                p.data.add_(-lrate, p.grad.data)
            total_loss += gloss.item()
            if global_step > 0 \
                    and global_step % params.disp_freq == 0:
                sub_loss = total_loss / params.disp_freq
                duration = time.time() - start_time
                print('| Train | epoch {:3d} | {:5d} batches | '
                      'lr {:.5f} | ms/batch {:5.2f} | '
                      'loss {:5.2f} | ppl {:8.2f}'.format(
                          epoch, bidx, lrate,
                          duration * 1000 / params.disp_freq, sub_loss,
                          math.exp(sub_loss)))
                total_loss = 0.
                start_time = time.time()
            global_step += 1
        # start evaluation
        # keep the batch_size as default, since we do not need so
        # accurate batch_size
        score, speed = eval(model, loss,
                            os.path.join(params.data_dir, 'dev.txt'), params)
        print('| Dev | epoch {:3d} | ms/batch {:5.2f} | '
              'loss {:5.2f} | ppl {:8.2f}'.format(epoch, speed, score,
                                                  math.exp(score)))
        if not best_valid_loss or score < best_valid_loss:
            # New best dev loss: persist the whole model object.
            best_valid_loss = score
            with open(params.save, 'wb') as f:
                torch.save(model, f)
        else:
            # No improvement: anneal the learning rate.
            lrate /= 4.0
from data import Dataset
from network import Network
import os
import datetime, re

# --- Run configuration for the image-fitting experiment ---
batchsize = 10
#visualize_after = 1000
# Visualise a reconstructed image every (2000 / batchsize) steps.
visualize_image_after = 2000 / batchsize
#matrix = np.random.rand(matrix_h, matrix_w, 3) * (matrix_max_xy - matrix_min_xy) + matrix_min_xy
image_filename = "woman.png"
#image_filename = "parrot.png"
#image_filename = "parrot.png"
dataset = Dataset(image_filename)
#alpha = 0.99
alpha = 0.991  # smoothing / decay factor — TODO confirm how Network uses it
# "strmost" (Czech: steepness) ramps from 1 to strmost_final between
# strmost_increase_after and strmost_increase_until steps — TODO confirm.
strmost = 1
strmost_increase_after = 25000 * 4
strmost_increase_until = strmost_increase_after * 2
strmost_final = 30
step = 1

# Create logdir name
logdir = "logs/{}-{}".format(
    os.path.basename(__file__),
    datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S"))
if __name__ == "__main__":
    # Smoke-test script: build shared-BPE tokenizers, a GNMT model and the
    # WMT14 en-de dataset, then iterate the training dataloader.
    print('test bpe tok ...')
    # Source and target share one BPE model/vocab (joint 37k merges).
    bpe_tok = src_tok = Tokenizer(
        'en',
        ['bpe:/pvc/minNMT/data/wmt14.en-de/bpe.37000.h100000/bpe.37000.share'],
        '/pvc/minNMT/data/wmt14.en-de/bpe.37000.h100000/bpe.37000.share.vocab')
    trg_tok = Tokenizer(
        'de',
        ['bpe:/pvc/minNMT/data/wmt14.en-de/bpe.37000.h100000/bpe.37000.share'],
        '/pvc/minNMT/data/wmt14.en-de/bpe.37000.h100000/bpe.37000.share.vocab')
    print("prepare model ...")
    gnmt = GNMT(src_tok, trg_tok, 512, 0.1).cuda()
    print('setup dataset ...')
    dataset = Dataset(src_tok, trg_tok,
                      '/pvc/minNMT/data/wmt14.en-de/bpe.37000.h100000')
    dataset.setup()
    print(f'done.')
    tensorboard = SummaryWriter()
    train_tqdm = tqdm(dataset.train_dataloader(20000))
    for b in train_tqdm:
        # NOTE(review): `closure` is defined but never invoked — nothing in
        # this loop calls it or passes it to an optimizer.step(closure), so
        # no training occurs as written. Presumably an optimizer step is
        # missing here; confirm against the original script.
        def closure():
            gnmt.zero_grad()
            loss, acc = gnmt.train_step(b)
            loss.backward()
            train_tqdm.set_postfix({'loss': loss.item(), 'acc': acc.item()})
            tensorboard.add_scalar('train/loss', loss.item(), train_tqdm.n)
            tensorboard.add_scalar('train/acc', acc.item(), train_tqdm.n)
            return loss
def main(args):
    """Train a compound PCFG grammar-induction model with DataParallel.

    Loads pre-batched train/val Datasets, trains with a length curriculum
    (args.max_length grows by args.len_incr each epoch up to
    args.final_max_length), logs parse F1 against gold spans, and saves a
    checkpoint whenever validation perplexity improves.
    """
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_data = Dataset(args.train_file)
    val_data = Dataset(args.val_file)
    #train_loader = DataLoader(dataset=train_data, shuffle=True)
    #val_loader = DataLoader(dataset=val_data, shuffle=True)
    train_sents = train_data.batch_size.sum()
    vocab_size = int(train_data.vocab_size)
    max_len = max(val_data.sents.size(1), train_data.sents.size(1))
    print('Train: %d sents / %d batches, Val: %d sents / %d batches' %
          (train_data.sents.size(0), len(train_data), val_data.sents.size(0),
           len(val_data)))
    print('Vocab size: %d, Max Sent Len: %d' % (vocab_size, max_len))
    print('Save Path', args.save_path)
    #cuda.set_device(args.gpu)
    model = GeneralCompPCFG(vocab=vocab_size,
                            state_dim=args.state_dim,
                            t_states=args.t_states,
                            nt_states=args.nt_states,
                            h_dim=args.h_dim,
                            w_dim=args.w_dim,
                            z_dim=args.z_dim,
                            prior=args.prior,
                            vpost=args.vpost)
    # model parallelize
    base_gpu = torch.device('cuda:0')
    model.to(base_gpu)
    model = BetterDataParallel(model)
    # Xavier-init all weight matrices (dim > 1 skips biases).
    for name, param in model.named_parameters():
        if param.dim() > 1:
            xavier_uniform_(param)
    print("model architecture")
    print(model)
    model.train()
    # model.cuda()
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 betas=(args.beta1, args.beta2))
    best_val_ppl = 1e5
    best_val_f1 = 0
    epoch = 0
    num_gpus = torch.cuda.device_count()
    while epoch < args.num_epochs:
        start_time = time.time()
        epoch += 1
        print('Starting epoch %d' % epoch)
        train_nll = 0.
        train_kl = 0.
        num_sents = 0.
        num_words = 0.
        # all_stats accumulates [TP, FP, FN]-style span statistics for F1
        # — TODO confirm against update_stats.
        all_stats = [[0., 0., 0.]]
        b = 0
        n_gpus = torch.cuda.device_count()
        # Visit pre-built batches in random order.
        for i in np.random.permutation(len(train_data)):
            b += 1
            sents, length, batch_size, _, gold_spans, gold_binary_trees, _ = \
                train_data[i]
            if length > args.max_length or length == 1:
                #length filter based on curriculum
                continue
            with open("batchsize.log", 'a') as fp:
                fp.write(str(batch_size) + "\n")
            # DataParallel requires the batch to split evenly across GPUs.
            if batch_size == 0 or batch_size % n_gpus != 0:
                continue
            #if batch_size == 0 or batch_size % num_gpus != 0: #gpu paraellization filter
            #    continue
            # sents = sents.cuda()
            #sents = sents.to(base_gpu)
            optimizer.zero_grad()
            nll, kl, binary_matrix, argmax_spans = model(sents, argmax=True)
            # ELBO objective: reconstruction NLL + KL, averaged over batch.
            (nll + kl).mean().backward()
            train_nll += nll.sum().item()
            train_kl += kl.sum().item()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           args.max_grad_norm)
            optimizer.step()
            num_sents += batch_size
            num_words += batch_size * (
                length + 1
            )  # we implicitly generate </s> so we explicitly count it
            for bb in range(batch_size):
                span_b = [(a[0].item(), a[1].item())
                          for a in argmax_spans[bb]]  #ignore labels
                span_b_set = set(span_b[:-1])
                update_stats(span_b_set, [set(gold_spans[bb][:-1])], all_stats)
            if b % args.print_every == 0:
                all_f1 = get_f1(all_stats)
                param_norm = sum([p.norm()**2
                                  for p in model.parameters()]).item()**0.5
                gparam_norm = sum([
                    p.grad.norm()**2 for p in model.parameters()
                    if p.grad is not None
                ]).item()**0.5
                log_str = 'Epoch: %d, Batch: %d/%d, |Param|: %.6f, |GParam|: %.2f, LR: %.4f, ' + \
                    'ReconPPL: %.2f, KL: %.4f, PPLBound: %.2f, ValPPL: %.2f, ValF1: %.2f, ' + \
                    'CorpusF1: %.2f, Throughput: %.2f examples/sec'
                print(log_str %
                      (epoch, b, len(train_data), param_norm, gparam_norm,
                       args.lr, np.exp(train_nll / num_words),
                       train_kl / num_sents,
                       np.exp((train_nll + train_kl) / num_words),
                       best_val_ppl, best_val_f1, all_f1[0],
                       num_sents / (time.time() - start_time)))
                # print an example parse
                tree = get_tree_from_binary_matrix(binary_matrix[0], length)
                action = get_actions(tree)
                sent_str = [
                    train_data.idx2word[word_idx]
                    for word_idx in list(sents[0].cpu().numpy())
                ]
                print("Pred Tree: %s" % get_tree(action, sent_str))
                print("Gold Tree: %s" % get_tree(gold_binary_trees[0], sent_str))
                # Mirror the progress line (plus a timestamp) into a log file.
                with open("dummy_output.log", 'a') as fp:
                    fp.write(log_str %
                             (epoch, b, len(train_data), param_norm,
                              gparam_norm, args.lr,
                              np.exp(train_nll / num_words),
                              train_kl / num_sents,
                              np.exp((train_nll + train_kl) / num_words),
                              best_val_ppl, best_val_f1, all_f1[0],
                              num_sents / (time.time() - start_time)) + "\n")
                    e = datetime.datetime.now()
                    fp.write("The time is now: = %s:%s:%s" %
                             (e.hour, e.minute, e.second) + "\n")
                # Record peak GPU memory per device.
                with open("gpu_stats.log", "a") as fp:
                    for i in range(torch.cuda.device_count()):
                        fp.write("GPU %d: %d\n" %
                                 (i,
                                  torch.cuda.max_memory_allocated("cuda:" +
                                                                  str(i))))
                    fp.write("\n")
        # Curriculum: allow longer sentences next epoch.
        args.max_length = min(args.final_max_length,
                              args.max_length + args.len_incr)
        print('--------------------------------')
        print('Checking validation perf...')
        val_ppl, val_f1 = eval(val_data, model)
        print('--------------------------------')
        if val_ppl < best_val_ppl:
            best_val_ppl = val_ppl
            best_val_f1 = val_f1
            checkpoint = {
                'args': args.__dict__,
                'model': model.cpu(),  # move to CPU so the checkpoint loads anywhere
                'word2idx': train_data.word2idx,
                'idx2word': train_data.idx2word
            }
            print('Saving checkpoint to %s' % args.save_path)
            torch.save(checkpoint, args.save_path)
            # model.cuda()
            model.to(base_gpu)
    type=int,
    help="whether load pretrained model")
# NOTE(review): the add_argument(...) call closed above begins before this
# view; its flag name is not visible here.
# NOTE(review): help text below looks copy-pasted from the pretrained-model
# flag — it actually selects which GPU to expose via CUDA_VISIBLE_DEVICES.
parser.add_argument('--gpu_index', default='0', type=str,
                    help="whether load pretrained model")
FLAGS, _ = parser.parse_known_args()
log('Settings')
utils.showFLAGS(FLAGS)
#%% set logger
logger = LOGGER(FLAGS)
log('Create Logger Successfully')
#%% set train data
dataset = Dataset(FLAGS, logger)
n_images = len(dataset.train_names)
inputsize = dataset.input_size
#%% create model
os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu_index
img_ch = 1  # single-channel (grayscale) input
img_height, img_width = inputsize[0], inputsize[1]
inputs = Input((img_height, img_width, img_ch))
# Classic SRCNN 9-1-5 architecture: feature extraction (9x9), non-linear
# mapping (1x1), reconstruction (5x5); 'valid' padding shrinks the output.
x = Conv2D(64, kernel_size=(9, 9), padding='valid')(inputs)
x = Activation('relu')(x)
x = Conv2D(32, kernel_size=(1, 1), padding='valid')(x)
x = Activation('relu')(x)
x = Conv2D(1, kernel_size=(5, 5), padding='valid')(x)
# NOTE(review): ReLU on the final reconstruction layer clips negatives;
# SRCNN variants often use a linear output here — confirm intent.
x = Activation('relu')(x)
SRCNN = Model(inputs=inputs, outputs=x, name='SRCNN')
        # Tail of a copy routine whose definition starts before this view:
        # copies the class-map and mask .npy files for `id` into label_path.
        cls_file = os.path.join(label_path, id + "_cls.npy")
        mask_file = os.path.join(label_path, id + "_nd.npy")
        copyfile(cls_source, cls_file)
        copyfile(mask_source, mask_file)


if __name__ == '__main__':
    # Inference driver: load a trained Mask R-CNN and run it on one scene.
    # train on the GPU or on the CPU, if a GPU is not available
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    # our dataset has three classes only - background, non-damaged, and damaged
    # NOTE(review): the comment above says three classes but num_classes is 6
    # — the trailing "3 or 6" suggests two labeling schemes; confirm which
    # matches the checkpoint being loaded.
    num_classes = 6  # 3 or 6
    dataset_test = Dataset("./datasets/Eureka_infer/102/",
                           "./datasets/Eureka_infer/102_labels/",
                           get_transform(train=False),
                           readsave=False)
    data_loader_test = torch.utils.data.DataLoader(dataset_test,
                                                   batch_size=1,
                                                   shuffle=False,
                                                   num_workers=2,
                                                   collate_fn=utils.collate_fn)
    mask_rcnn = get_model_instance_segmentation(num_classes,
                                                image_mean=None,
                                                image_std=None,
                                                stats=False)
    # Restore weights from the epoch-21 training snapshot.
    mask_rcnn.load_state_dict(
        torch.load("trained_param_eureka_aug_mult/epoch_0021.param"))
def main(args):
    """Train a text VAE in one of four modes (args.model):
    'autoreg' (pure decoder), 'vae' (amortized), 'svi' (stochastic
    variational inference via OptimN2N), or 'savae' (amortized init + SVI
    refinement). Checkpoints on best validation NLL; decays LR otherwise.
    """
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_data = Dataset(args.train_file)
    val_data = Dataset(args.val_file)
    test_data = Dataset(args.test_file)
    train_sents = train_data.batch_size.sum()
    vocab_size = int(train_data.vocab_size)
    logger.info('Train data: %d batches' % len(train_data))
    logger.info('Val data: %d batches' % len(val_data))
    logger.info('Test data: %d batches' % len(test_data))
    logger.info('Word vocab size: %d' % vocab_size)
    checkpoint_dir = args.checkpoint_dir
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    suffix = "%s_%s.pt" % (args.model, 'bl')
    checkpoint_path = os.path.join(checkpoint_dir, suffix)
    if args.slurm == 0:
        cuda.set_device(args.gpu)
    if args.train_from == '':
        # Fresh model, uniform init in [-0.1, 0.1].
        model = RNNVAE(vocab_size=vocab_size,
                       enc_word_dim=args.enc_word_dim,
                       enc_h_dim=args.enc_h_dim,
                       enc_num_layers=args.enc_num_layers,
                       dec_word_dim=args.dec_word_dim,
                       dec_h_dim=args.dec_h_dim,
                       dec_num_layers=args.dec_num_layers,
                       dec_dropout=args.dec_dropout,
                       latent_dim=args.latent_dim,
                       mode=args.model)
        for param in model.parameters():
            param.data.uniform_(-0.1, 0.1)
    else:
        logger.info('loading model from ' + args.train_from)
        checkpoint = torch.load(args.train_from)
        model = checkpoint['model']
    logger.info("model architecture")
    print(model)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    # KL annealing: beta warms from 0.1 to 1 over args.warmup epochs.
    if args.warmup == 0:
        args.beta = 1.
    else:
        args.beta = 0.1
    criterion = nn.NLLLoss()
    model.cuda()
    criterion.cuda()
    model.train()

    def variational_loss(input, sents, model, z=None):
        # ELBO for a given variational (mean, logvar): recon NLL + beta*KL.
        mean, logvar = input
        z_samples = model._reparameterize(mean, logvar, z)
        preds = model._dec_forward(sents, z_samples)
        nll = sum([
            criterion(preds[:, l], sents[:, l + 1])
            for l in range(preds.size(1))
        ])
        kl = utils.kl_loss_diag(mean, logvar)
        return nll + args.beta * kl

    update_params = list(model.dec.parameters())
    # Meta-optimizer that runs SVI refinement steps on the variational params.
    meta_optimizer = OptimN2N(variational_loss,
                              model,
                              update_params,
                              eps=args.eps,
                              lr=[args.svi_lr1, args.svi_lr2],
                              iters=args.svi_steps,
                              momentum=args.momentum,
                              acc_param_grads=args.train_n2n == 1,
                              max_grad_norm=args.svi_max_grad_norm)
    if args.test == 1:
        # Test-only mode: evaluate and exit.
        args.beta = 1
        test_data = Dataset(args.test_file)
        eval(test_data, model, meta_optimizer)
        exit()
    t = 0
    best_val_nll = 1e5
    best_epoch = 0
    val_stats = []
    epoch = 0
    while epoch < args.num_epochs:
        start_time = time.time()
        epoch += 1
        logger.info('Starting epoch %d' % epoch)
        train_nll_vae = 0.
        train_nll_autoreg = 0.
        train_kl_vae = 0.
        train_nll_svi = 0.
        train_kl_svi = 0.
        train_kl_init_final = 0.
        num_sents = 0
        num_words = 0
        b = 0
        for i in np.random.permutation(len(train_data)):
            if args.warmup > 0:
                # Linear KL annealing per batch, capped at 1.
                args.beta = min(
                    1, args.beta + 1. / (args.warmup * len(train_data)))
            sents, length, batch_size = train_data[i]
            if args.gpu >= 0:
                sents = sents.cuda()
            b += 1
            optimizer.zero_grad()
            if args.model == 'autoreg':
                # Pure autoregressive decoder: no latent variable.
                preds = model._dec_forward(sents, None, True)
                nll_autoreg = sum([
                    criterion(preds[:, l], sents[:, l + 1])
                    for l in range(length)
                ])
                # NOTE(review): .data[0] is the pre-0.4 PyTorch idiom (.item()).
                train_nll_autoreg += nll_autoreg.data[0] * batch_size
                nll_autoreg.backward()
            elif args.model == 'svi':
                # SVI: variational params start near zero and are refined by
                # the meta-optimizer; decoder is trained on detached samples.
                mean_svi = Variable(
                    0.1 * torch.zeros(batch_size, args.latent_dim).cuda(),
                    requires_grad=True)
                logvar_svi = Variable(
                    0.1 * torch.zeros(batch_size, args.latent_dim).cuda(),
                    requires_grad=True)
                var_params_svi = meta_optimizer.forward(
                    [mean_svi, logvar_svi], sents,
                    b % args.print_every == 0)
                mean_svi_final, logvar_svi_final = var_params_svi
                z_samples = model._reparameterize(mean_svi_final.detach(),
                                                  logvar_svi_final.detach())
                preds = model._dec_forward(sents, z_samples)
                nll_svi = sum([
                    criterion(preds[:, l], sents[:, l + 1])
                    for l in range(length)
                ])
                train_nll_svi += nll_svi.data[0] * batch_size
                kl_svi = utils.kl_loss_diag(mean_svi_final, logvar_svi_final)
                train_kl_svi += kl_svi.data[0] * batch_size
                var_loss = nll_svi + args.beta * kl_svi
                var_loss.backward(retain_graph=True)
            else:
                # 'vae' and 'savae' both start from the amortized encoder.
                mean, logvar = model._enc_forward(sents)
                z_samples = model._reparameterize(mean, logvar)
                preds = model._dec_forward(sents, z_samples)
                nll_vae = sum([
                    criterion(preds[:, l], sents[:, l + 1])
                    for l in range(length)
                ])
                train_nll_vae += nll_vae.data[0] * batch_size
                kl_vae = utils.kl_loss_diag(mean, logvar)
                train_kl_vae += kl_vae.data[0] * batch_size
                if args.model == 'vae':
                    vae_loss = nll_vae + args.beta * kl_vae
                    vae_loss.backward(retain_graph=True)
                if args.model == 'savae':
                    # SA-VAE: refine the encoder's output with SVI steps,
                    # then backprop through the refinement (train_n2n) or
                    # train the encoder with a KL/ELBO surrogate.
                    var_params = torch.cat([mean, logvar], 1)
                    mean_svi = Variable(mean.data, requires_grad=True)
                    logvar_svi = Variable(logvar.data, requires_grad=True)
                    var_params_svi = meta_optimizer.forward(
                        [mean_svi, logvar_svi], sents,
                        b % args.print_every == 0)
                    mean_svi_final, logvar_svi_final = var_params_svi
                    z_samples = model._reparameterize(mean_svi_final,
                                                      logvar_svi_final)
                    preds = model._dec_forward(sents, z_samples)
                    nll_svi = sum([
                        criterion(preds[:, l], sents[:, l + 1])
                        for l in range(length)
                    ])
                    train_nll_svi += nll_svi.data[0] * batch_size
                    kl_svi = utils.kl_loss_diag(mean_svi_final,
                                                logvar_svi_final)
                    train_kl_svi += kl_svi.data[0] * batch_size
                    var_loss = nll_svi + args.beta * kl_svi
                    var_loss.backward(retain_graph=True)
                    if args.train_n2n == 0:
                        if args.train_kl == 1:
                            # Train encoder to match the refined posterior.
                            mean_final = mean_svi_final.detach()
                            logvar_final = logvar_svi_final.detach()
                            kl_init_final = utils.kl_loss(
                                mean, logvar, mean_final, logvar_final)
                            train_kl_init_final += kl_init_final.data[
                                0] * batch_size
                            kl_init_final.backward(retain_graph=True)
                        else:
                            # Train encoder on the unrefined ELBO gradient.
                            vae_loss = nll_vae + args.beta * kl_vae
                            var_param_grads = torch.autograd.grad(
                                vae_loss, [mean, logvar], retain_graph=True)
                            var_param_grads = torch.cat(var_param_grads, 1)
                            var_params.backward(var_param_grads,
                                                retain_graph=True)
                    else:
                        # Backprop through the SVI steps themselves.
                        var_param_grads = meta_optimizer.backward(
                            [mean_svi_final.grad, logvar_svi_final.grad],
                            b % args.print_every == 0)
                        var_param_grads = torch.cat(var_param_grads, 1)
                        var_params.backward(var_param_grads)
            if args.max_grad_norm > 0:
                torch.nn.utils.clip_grad_norm(model.parameters(),
                                              args.max_grad_norm)
            optimizer.step()
            num_sents += batch_size
            num_words += batch_size * length
            if b % args.print_every == 0:
                param_norm = sum([p.norm()**2
                                  for p in model.parameters()]).data[0]**0.5
                logger.info(
                    'Iters: %d, Epoch: %d, Batch: %d/%d, LR: %.4f, TrainARNLL: %.4f, TrainARPPL: %.2f, TrainVAE_NLL: %.4f, TrainVAE_REC: %.4f, TrainVAE_KL: %.4f, TrainVAE_PPL: %.2f, TrainSVI_NLL: %.2f, TrainSVI_REC: %.2f, TrainSVI_KL: %.4f, TrainSVI_PPL: %.2f, KLInitFinal: %.2f, |Param|: %.4f, BestValPerf: %.2f, BestEpoch: %d, Beta: %.4f, Throughput: %.2f examples/sec'
                    % (t, epoch, b + 1, len(train_data), args.lr,
                       train_nll_autoreg / num_sents,
                       np.exp(train_nll_autoreg / num_words),
                       (train_nll_vae + train_kl_vae) / num_sents,
                       train_nll_vae / num_sents, train_kl_vae / num_sents,
                       np.exp((train_nll_vae + train_kl_vae) / num_words),
                       (train_nll_svi + train_kl_svi) / num_sents,
                       train_nll_svi / num_sents, train_kl_svi / num_sents,
                       np.exp((train_nll_svi + train_kl_svi) / num_words),
                       train_kl_init_final / num_sents, param_norm,
                       best_val_nll, best_epoch, args.beta,
                       num_sents / (time.time() - start_time)))
        epoch_train_time = time.time() - start_time
        logger.info('Time Elapsed: %.1fs' % epoch_train_time)
        logger.info('--------------------------------')
        logger.info('Checking validation perf...')
        logger.record_tabular('Epoch', epoch)
        logger.record_tabular('Mode', 'Val')
        logger.record_tabular('LR', args.lr)
        logger.record_tabular('Epoch Train Time', epoch_train_time)
        val_nll = eval(val_data, model, meta_optimizer)
        val_stats.append(val_nll)
        logger.info('--------------------------------')
        logger.info('Checking test perf...')
        logger.record_tabular('Epoch', epoch)
        logger.record_tabular('Mode', 'Test')
        logger.record_tabular('LR', args.lr)
        logger.record_tabular('Epoch Train Time', epoch_train_time)
        test_nll = eval(test_data, model, meta_optimizer)
        if val_nll < best_val_nll:
            best_val_nll = val_nll
            best_epoch = epoch
            # Move to CPU before pickling so the checkpoint loads anywhere.
            model.cpu()
            checkpoint = {
                'args': args.__dict__,
                'model': model,
                'val_stats': val_stats
            }
            logger.info('Save checkpoint to %s' % checkpoint_path)
            torch.save(checkpoint, checkpoint_path)
            model.cuda()
        else:
            if epoch >= args.min_epochs:
                args.decay = 1
def main():
    """Parse CLI args, build the RoofNet data pipeline/model/optimizer, train, and checkpoint.

    Side effects: writes TensorBoard logs under ./logs/5, saves
    'roof_cnn_best.pt' whenever validation loss improves, and optionally
    saves the final weights to 'roof_cnn.pt'.
    """
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch RoofNet test')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=True,
                        help='For Saving the current Model')
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    kwargs = {'num_workers': 0, 'pin_memory': True} if use_cuda else {}

    # Data preprocessing: resize to 224x224 and apply ImageNet normalization.
    preprocessing = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    # Dataset loading and splitting
    data, weights = load_dataset(data_type='train', verified=True)
    validation_split = .05
    random_seed = 42
    shuffle_dataset = True

    # Creating data indices for training and validation splits:
    dataset_size = len(data)
    split = int(np.floor(validation_split * dataset_size))

    # Shuffle with a fixed seed so the train/val split is reproducible.
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(data)
    train_data, val_data = data[split:], data[:split]
    train_dataset, val_dataset = (Dataset(train_data, preprocessing),
                                  Dataset(val_data, preprocessing, train=False))

    # Setting batch data loaders.
    # BUGFIX: the training loader previously hard-coded batch_size=64,
    # silently ignoring --batch-size; honor the flag (its default is still 64,
    # so default behavior is unchanged).
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=True, **kwargs)
    # NOTE(review): validation uses batch_size=1 and --test-batch-size is
    # unused here — kept as-is; confirm whether that flag was meant to apply.
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=1, shuffle=True, **kwargs)

    # Creating model and loading on gpu
    model = RoofEnsemble().to(device)
    print(model)
    model = nn.DataParallel(model)  # As multi-gpu in Keras

    # Optimizer. NOTE(review): lr is hard-coded to 0.001 and does not use
    # --lr (whose default is 0.01); left unchanged to preserve behavior.
    optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999),
                           eps=1e-08, weight_decay=0, amsgrad=False)

    # Class weights
    weights = torch.from_numpy(weights).to(device)

    best = 100  # running best validation loss; starts above any realistic loss
    # max_queue=1 to flush data at every add
    writer = SummaryWriter(log_dir='./logs/5', max_queue=1)

    # Training and testing phase
    for epoch in range(1, args.epochs + 1):
        train_loss, train_acc = train(args, model, device, train_loader,
                                      optimizer, epoch, weights)
        val_loss, test_acc = test(args, model, device, val_loader, weights)
        # Checkpoint whenever validation loss improves.
        if val_loss < best:
            torch.save(model.module.state_dict(), 'roof_cnn_best.pt')
            best = val_loss
        writer.add_scalar('Loss/train', train_loss, epoch)
        writer.add_scalar('Loss/test', val_loss, epoch)
        writer.add_scalar('Acc/train', train_acc, epoch)
        writer.add_scalar('Acc/test', test_acc, epoch)
    # FIX: close the writer so pending events are flushed (was never closed).
    writer.close()

    # Saving model
    if args.save_model:
        torch.save(model.module.state_dict(), 'roof_cnn.pt')
# Inception-v3 classifier setup (TensorFlow 1.x graph mode) for the NDSB data.
from inception_v3 import inception_v3
from data import Dataset
import tensorflow as tf
import cv2
import numpy as np

# Constants
IMAGE_WIDTH = 299   # Inception-v3's canonical input resolution is 299x299
IMAGE_HEIGHT = 299
EPOCHS = 100
BATCH_SIZE = 50

# Load dataset (project-local Dataset; reads the 'train' split at the given size).
ndsb = Dataset('train', IMAGE_HEIGHT, IMAGE_WIDTH)
ndsb.read_data()
num_classes = ndsb.num_classes

# Placeholder inputs and output: NHWC float images and one-hot labels.
inputs = tf.placeholder(tf.float32, [None, IMAGE_HEIGHT, IMAGE_WIDTH, 3])
predict = tf.placeholder(tf.float32, [None, num_classes])

# Get model. inception_v3 returns a pair; only the logits tensor is used here.
# NOTE(review): the second return value is unused — presumably aux logits or
# endpoints; confirm against inception_v3's definition.
y_conv, some = inception_v3(inputs, num_classes=num_classes)
print(y_conv.shape)
print(predict.shape)

# Cross entropy graph: mean softmax cross-entropy between labels and logits.
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=predict, logits=y_conv))
def main():
    """Run top-5 inference over a directory of images and write results to CSV.

    Builds the model named by the CLI args, runs it over the dataset at
    args.data, and writes one '<filename>,<top1>,...,<top5>' row per image
    to <args.output_dir>/top5_ids.csv.
    """
    args = parser.parse_args()

    # create model (project helper; optionally restores a checkpoint)
    model = create_model(
        args.model,
        num_classes=args.num_classes,
        in_chans=3,
        pretrained=args.pretrained,
        checkpoint_path=args.checkpoint)

    print('Model %s created, param count: %d' %
          (args.model, sum([m.numel() for m in model.parameters()])))

    config = resolve_data_config(model, args)
    model, test_time_pool = apply_test_time_pool(model, config, args)

    if args.num_gpu > 1:
        model = torch.nn.DataParallel(
            model, device_ids=list(range(args.num_gpu))).cuda()
    else:
        model = model.cuda()

    loader = create_loader(
        Dataset(args.data),
        input_size=config['input_size'],
        batch_size=args.batch_size,
        use_prefetcher=True,
        interpolation=config['interpolation'],
        mean=config['mean'],
        std=config['std'],
        num_workers=args.workers,
        # test-time pooling consumes the full image, so skip center cropping
        crop_pct=1.0 if test_time_pool else config['crop_pct'])

    model.eval()

    batch_time = AverageMeter()
    end = time.time()
    top5_ids = []
    with torch.no_grad():
        # FIX: loop variable renamed from `input`, which shadowed the builtin.
        for batch_idx, (images, _) in enumerate(loader):
            images = images.cuda()
            labels = model(images)
            top5 = labels.topk(5)[1]  # indices of the 5 highest logits
            top5_ids.append(top5.cpu().numpy())

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if batch_idx % args.print_freq == 0:
                print(
                    'Predict: [{0}/{1}]\t'
                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f})'.format(
                        batch_idx, len(loader), batch_time=batch_time))

    top5_ids = np.concatenate(top5_ids, axis=0).squeeze()

    # FIX: dropped the stray './' formerly passed to os.path.join (it produced
    # paths like 'out/./top5_ids.csv'); resolves to the same file, cleanly.
    with open(os.path.join(args.output_dir, 'top5_ids.csv'), 'w') as out_file:
        filenames = loader.dataset.filenames()
        for filename, label in zip(filenames, top5_ids):
            filename = os.path.basename(filename)
            out_file.write('{0},{1},{2},{3},{4},{5}\n'.format(
                filename, label[0], label[1], label[2], label[3], label[4]))
def train(params):
    """Train the NMT model described by `params` with gradient accumulation.

    Restores any existing/pretrained checkpoint, then loops over epochs and
    batches, accumulating gradients for `params.update_cycle` micro-batches
    before each parameter update. Periodically logs, saves checkpoints,
    evaluates BLEU on the dev set (optionally under EMA weights), samples
    translations, and applies early stopping via `params.recorder`.

    Returns:
        0. if the recorder already signals a stop condition; otherwise the
        best score tracked by the training saver.
    """
    # status measure — bail out immediately if a previous run already finished
    if params.recorder.estop or \
            params.recorder.epoch > params.epoches or \
            params.recorder.step > params.max_training_steps:
        tf.logging.info("Stop condition reached, you have finished training your model.")
        return 0.

    # loading dataset
    tf.logging.info("Begin Loading Training and Dev Dataset")
    start_time = time.time()
    train_dataset = Dataset(params.src_train_file, params.tgt_train_file,
                            params.src_vocab, params.tgt_vocab, params.max_len,
                            batch_or_token=params.batch_or_token,
                            data_leak_ratio=params.data_leak_ratio)
    # NOTE(review): dev dataset is built with the source file/vocab on both
    # sides — targets are not needed at decode time; confirm intended.
    dev_dataset = Dataset(params.src_dev_file, params.src_dev_file,
                          params.src_vocab, params.src_vocab, params.eval_max_len,
                          batch_or_token='batch',
                          data_leak_ratio=params.data_leak_ratio)
    tf.logging.info(
        "End Loading dataset, within {} seconds".format(time.time() - start_time))

    # Build Graph
    with tf.Graph().as_default():
        lr = tf.placeholder(tf.as_dtype(dtype.floatx()), [], "learn_rate")

        # shift automatically sliced multi-gpu process into `zero` manner :)
        # One placeholder pair per GPU shard (at least one on CPU-only runs).
        features = []
        for fidx in range(max(len(params.gpus), 1)):
            feature = {
                "source": tf.placeholder(tf.int32, [None, None], "source"),
                "target": tf.placeholder(tf.int32, [None, None], "target"),
            }
            features.append(feature)

        # session info
        sess = util.get_session(params.gpus)

        tf.logging.info("Begining Building Training Graph")
        start_time = time.time()

        # create global step
        global_step = tf.train.get_or_create_global_step()

        # set up optimizer
        optimizer = tf.train.AdamOptimizer(lr,
                                           beta1=params.beta1,
                                           beta2=params.beta2,
                                           epsilon=params.epsilon)

        # get graph
        graph = model.get_model(params.model_name)

        # set up training graph
        loss, gradients = tower_train_graph(features, optimizer, graph, params)

        # apply pseudo cyclic parallel operation (gradient accumulation ops:
        # zero_op / collect_op / train_op live in `ops`, metrics in `vle`)
        vle, ops = cycle.create_train_op({"loss": loss}, gradients,
                                         optimizer, global_step, params)

        tf.logging.info("End Building Training Graph, within {} seconds".format(time.time() - start_time))

        tf.logging.info("Begin Building Inferring Graph")
        start_time = time.time()

        # set up infer graph
        eval_seqs, eval_scores = tower_infer_graph(features, graph, params)

        tf.logging.info("End Building Inferring Graph, within {} seconds".format(time.time() - start_time))

        # initialize the model
        sess.run(tf.global_variables_initializer())

        # log parameters
        util.variable_printer()

        # create saver
        train_saver = saver.Saver(
            checkpoints=params.checkpoints,
            output_dir=params.output_dir,
            best_checkpoints=params.best_checkpoints,
        )

        tf.logging.info("Training")
        cycle_counter = 0      # micro-batches accumulated since last update
        data_on_gpu = []       # shards waiting to fill all GPU placeholders
        cum_tokens = []        # target-token counts since last log line

        # restore parameters: pretrained weights first, then any run-in-progress
        tf.logging.info("Trying restore pretrained parameters")
        train_saver.restore(sess, path=params.pretrained_model)

        tf.logging.info("Trying restore existing parameters")
        train_saver.restore(sess)

        # setup learning rate (resume from recorded value)
        params.lrate = params.recorder.lrate
        adapt_lr = lrs.get_lr(params)

        start_time = time.time()
        start_epoch = params.recorder.epoch
        for epoch in range(start_epoch, params.epoches + 1):

            params.recorder.epoch = epoch

            tf.logging.info("Training the model for epoch {}".format(epoch))
            size = params.batch_size if params.batch_or_token == 'batch' \
                else params.token_size

            train_queue = queuer.EnQueuer(
                train_dataset.batcher(size,
                                      buffer_size=params.buffer_size,
                                      shuffle=params.shuffle_batch,
                                      train=True),
                lambda x: x,
                worker_processes_num=params.process_num,
                input_queue_size=params.input_queue_size,
                output_queue_size=params.output_queue_size,
            )

            adapt_lr.before_epoch(eidx=epoch)

            for lidx, data in enumerate(train_queue):

                # when resuming, skip batches already consumed per the record
                if params.train_continue:
                    if lidx <= params.recorder.lidx:
                        segments = params.recorder.lidx // 5
                        if params.recorder.lidx < 5 or lidx % segments == 0:
                            tf.logging.info(
                                "{} Passing {}-th index according to record".format(util.time_str(time.time()), lidx))
                        continue

                params.recorder.lidx = lidx

                data_on_gpu.append(data)
                # use multiple gpus, and data samples is not enough
                # make sure the data is fully added
                # The actual batch size: batch_size * num_gpus * update_cycle
                if len(params.gpus) > 0 and len(data_on_gpu) < len(params.gpus):
                    continue

                # increase the counter by 1
                cycle_counter += 1

                if cycle_counter == 1:
                    # calculate adaptive learning rate
                    adapt_lr.step(params.recorder.step)

                    # clear internal states (accumulated gradients)
                    sess.run(ops["zero_op"])

                # data feeding to gpu placeholders
                feed_dicts = {}
                for fidx, shard_data in enumerate(data_on_gpu):
                    # define feed_dict
                    feed_dict = {
                        features[fidx]["source"]: shard_data["src"],
                        features[fidx]["target"]: shard_data["tgt"],
                        lr: adapt_lr.get_lr(),
                    }
                    feed_dicts.update(feed_dict)

                    # collect target tokens (non-padding entries)
                    cum_tokens.append(np.sum(shard_data['tgt'] > 0))

                # reset data points on gpus
                data_on_gpu = []

                # internal accumulative gradient collection
                if cycle_counter < params.update_cycle:
                    sess.run(ops["collect_op"], feed_dict=feed_dicts)

                # at the final step, update model parameters
                if cycle_counter == params.update_cycle:
                    cycle_counter = 0

                    # directly update parameters, usually this works well
                    if not params.safe_nan:
                        _, loss, gnorm, pnorm, gstep = sess.run(
                            [ops["train_op"], vle["loss"], vle["gradient_norm"],
                             vle["parameter_norm"], global_step],
                            feed_dict=feed_dicts)

                        if np.isnan(loss) or np.isinf(loss) or np.isnan(gnorm) or np.isinf(gnorm):
                            tf.logging.error("Nan or Inf raised! Loss {} GNorm {}.".format(loss, gnorm))
                            params.recorder.estop = True
                            break
                    else:
                        # Notice, applying safe nan can help train the big model, but sacrifice speed
                        # (inspect metrics first; only run train_op if they are sane)
                        loss, gnorm, pnorm, gstep = sess.run(
                            [vle["loss"], vle["gradient_norm"],
                             vle["parameter_norm"], global_step],
                            feed_dict=feed_dicts)

                        if np.isnan(loss) or np.isinf(loss) or np.isnan(gnorm) or np.isinf(gnorm) \
                                or gnorm > params.gnorm_upper_bound:
                            tf.logging.error(
                                "Nan or Inf raised, GStep {} is passed! Loss {} GNorm {}.".format(gstep, loss, gnorm))
                            continue

                        sess.run(ops["train_op"], feed_dict=feed_dicts)

                    if gstep % params.disp_freq == 0:
                        end_time = time.time()
                        tf.logging.info(
                            "{} Epoch {}, GStep {}~{}, LStep {}~{}, "
                            "Loss {:.3f}, GNorm {:.3f}, PNorm {:.3f}, Lr {:.5f}, "
                            "Src {}, Tgt {}, Tokens {}, UD {:.3f} s".format(
                                util.time_str(end_time), epoch,
                                gstep - params.disp_freq + 1, gstep,
                                lidx - params.disp_freq + 1, lidx,
                                loss, gnorm, pnorm, adapt_lr.get_lr(),
                                data['src'].shape, data['tgt'].shape,
                                np.sum(cum_tokens), end_time - start_time)
                        )
                        start_time = time.time()
                        cum_tokens = []

                    # trigger model saver
                    if gstep > 0 and gstep % params.save_freq == 0:
                        train_saver.save(sess, gstep)
                        params.recorder.save_to_json(os.path.join(params.output_dir, "record.json"))

                    # trigger model evaluation
                    if gstep > 0 and gstep % params.eval_freq == 0:
                        # temporarily swap in EMA weights for evaluation
                        if params.ema_decay > 0.:
                            sess.run(ops['ema_backup_op'])
                            sess.run(ops['ema_assign_op'])

                        tf.logging.info("Start Evaluating")
                        eval_start_time = time.time()
                        tranes, scores, indices = evalu.decoding(
                            sess, features, eval_seqs, eval_scores, dev_dataset, params)
                        bleu = evalu.eval_metric(tranes, params.tgt_dev_file, indices=indices)
                        eval_end_time = time.time()
                        tf.logging.info("End Evaluating")

                        if params.ema_decay > 0.:
                            sess.run(ops['ema_restore_op'])

                        tf.logging.info(
                            "{} GStep {}, Scores {}, BLEU {}, Duration {:.3f} s".format(
                                util.time_str(eval_end_time), gstep, np.mean(scores), bleu,
                                eval_end_time - eval_start_time)
                        )

                        # save eval translation
                        evalu.dump_tanslation(
                            tranes,
                            os.path.join(params.output_dir, "eval-{}.trans.txt".format(gstep)),
                            indices=indices)

                        # save parameters (BLEU-aware best-checkpoint tracking)
                        train_saver.save(sess, gstep, bleu)

                        # check for early stopping
                        valid_scores = [v[1] for v in params.recorder.valid_script_scores]
                        if len(valid_scores) == 0 or bleu > np.max(valid_scores):
                            params.recorder.bad_counter = 0
                        else:
                            params.recorder.bad_counter += 1

                            if params.recorder.bad_counter > params.estop_patience:
                                params.recorder.estop = True
                                break

                        params.recorder.history_scores.append((gstep, float(np.mean(scores))))
                        params.recorder.valid_script_scores.append((gstep, float(bleu)))
                        params.recorder.save_to_json(os.path.join(params.output_dir, "record.json"))

                        # handle the learning rate decay in a typical manner
                        adapt_lr.after_eval(float(bleu))

                    # trigger temporary sampling
                    if gstep > 0 and gstep % params.sample_freq == 0:
                        tf.logging.info("Start Sampling")
                        # decode the first 5 sentences of the current batch on GPU 0
                        decode_seqs, decode_scores = sess.run(
                            [eval_seqs[:1], eval_scores[:1]],
                            feed_dict={features[0]["source"]: data["src"][:5]})
                        tranes, scores = evalu.decode_hypothesis(decode_seqs, decode_scores, params)

                        for sidx in range(min(5, len(scores))):
                            sample_source = evalu.decode_target_token(data['src'][sidx], params.src_vocab)
                            tf.logging.info("{}-th Source: {}".format(sidx, ' '.join(sample_source)))
                            sample_target = evalu.decode_target_token(data['tgt'][sidx], params.tgt_vocab)
                            tf.logging.info("{}-th Target: {}".format(sidx, ' '.join(sample_target)))
                            sample_trans = tranes[sidx]
                            tf.logging.info("{}-th Translation: {}".format(sidx, ' '.join(sample_trans)))

                        tf.logging.info("End Sampling")

                    # trigger stopping
                    if gstep >= params.max_training_steps:
                        # stop running by setting EStop signal
                        params.recorder.estop = True
                        break

                    # should be equal to global_step
                    params.recorder.step = gstep

            if params.recorder.estop:
                tf.logging.info("Early Stopped!")
                break

            # reset to 0 (so the next epoch does not skip batches on resume)
            params.recorder.lidx = -1

            adapt_lr.after_epoch(eidx=epoch)

        # Final Evaluation
        tf.logging.info("Start Final Evaluating")
        if params.ema_decay > 0.:
            sess.run(ops['ema_backup_op'])
            sess.run(ops['ema_assign_op'])

        gstep = int(params.recorder.step + 1)
        eval_start_time = time.time()
        tranes, scores, indices = evalu.decoding(sess, features, eval_seqs, eval_scores, dev_dataset, params)
        bleu = evalu.eval_metric(tranes, params.tgt_dev_file, indices=indices)
        eval_end_time = time.time()
        tf.logging.info("End Evaluating")
        if params.ema_decay > 0.:
            sess.run(ops['ema_restore_op'])

        tf.logging.info(
            "{} GStep {}, Scores {}, BLEU {}, Duration {:.3f} s".format(
                util.time_str(eval_end_time), gstep, np.mean(scores), bleu,
                eval_end_time - eval_start_time)
        )

        # save eval translation
        evalu.dump_tanslation(
            tranes,
            os.path.join(params.output_dir, "eval-{}.trans.txt".format(gstep)),
            indices=indices)

        tf.logging.info("Your training is finished :)")

        return train_saver.best_score
def __init__(self, config=defaults, name=None):
    """Initialize instance state: configuration, dataset, document, and
    empty text/tag accumulators.

    Args:
        config: configuration object; defaults to the module-level
            ``defaults``. NOTE(review): the same ``defaults`` object is
            shared by every instance that relies on this default — if it
            is mutable, in-place changes leak across instances; confirm
            this is intended.
        name: optional name forwarded to the project ``Dataset``.
    """
    self.config = config
    # Backing dataset and working document (project-local types).
    self.dataset = Dataset(name=name)
    self.document = Document()
    # Accumulators, presumably filled later by other methods — empty here.
    self.texts = []
    self.tags = []
def scorer(params):
    """Score the test set with a trained model and return the mean score.

    Builds the scoring graph, restores the latest checkpoint (optionally
    swapping in EMA-averaged weights), runs `evalu.scoring` over the test
    dataset, logs score/PPL, dumps the scores to `params.test_output`, and
    returns `np.mean(scores)`.
    """
    # loading dataset
    tf.logging.info("Begin Loading Test Dataset")
    start_time = time.time()
    test_dataset = Dataset(params.src_test_file, params.tgt_test_file,
                           params.src_vocab, params.tgt_vocab, params.eval_max_len,
                           batch_or_token='batch',
                           data_leak_ratio=params.data_leak_ratio)
    tf.logging.info(
        "End Loading dataset, within {} seconds".format(time.time() - start_time))

    # Build Graph
    with tf.Graph().as_default():
        # one source/target placeholder pair per GPU shard (min. one)
        features = []
        for fidx in range(max(len(params.gpus), 1)):
            feature = {
                "source": tf.placeholder(tf.int32, [None, None], "source"),
                "target": tf.placeholder(tf.int32, [None, None], "target"),
            }
            features.append(feature)

        # session info
        sess = util.get_session(params.gpus)

        tf.logging.info("Begining Building Evaluation Graph")
        start_time = time.time()

        # get graph
        graph = model.get_model(params.model_name)

        # set up infer graph
        eval_scores = tower_score_graph(features, graph, params)

        tf.logging.info("End Building Inferring Graph, within {} seconds".format(time.time() - start_time))

        # set up ema
        if params.ema_decay > 0.:
            # recover from EMA: overwrite each trainable variable with its
            # exponential moving average before scoring
            ema = tf.train.ExponentialMovingAverage(decay=params.ema_decay)
            ema.apply(tf.trainable_variables())
            ema_assign_op = tf.group(*(tf.assign(var, ema.average(var).read_value())
                                       for var in tf.trainable_variables()))
        else:
            ema_assign_op = tf.no_op()

        # initialize the model
        sess.run(tf.global_variables_initializer())

        # log parameters
        util.variable_printer()

        # create saver
        eval_saver = saver.Saver(checkpoints=params.checkpoints, output_dir=params.output_dir)

        # restore parameters
        tf.logging.info("Trying restore existing parameters")
        eval_saver.restore(sess, params.output_dir)
        sess.run(ema_assign_op)

        tf.logging.info("Starting Evaluating")
        eval_start_time = time.time()
        scores, ppl = evalu.scoring(sess, features, eval_scores, test_dataset, params)
        eval_end_time = time.time()

        tf.logging.info(
            "{} Scores {}, PPL {}, Duration {}s".format(
                util.time_str(eval_end_time), np.mean(scores), ppl,
                eval_end_time - eval_start_time)
        )

        # save translation (here: per-sentence scores, not text)
        evalu.dump_tanslation(scores, params.test_output)

        return np.mean(scores)
def main():
    """Evaluate (or LPOT-tune/quantize) a geffnet model on an ImageNet val split.

    With --tune, quantizes the model via LPOT and exits. Otherwise runs
    top-1/top-5 evaluation — optionally on an int8 model restored from
    --tuned_checkpoint — skipping the first --warmup_iterations batches when
    timing, and prints latency/throughput/accuracy.
    """
    args = parser.parse_args()
    print(args)

    # derive input size / crop from the model name when not given explicitly
    if args.img_size is None:
        args.img_size, args.crop_pct = get_image_size_crop_pct(args.model)

    if not args.checkpoint and not args.pretrained:
        args.pretrained = True

    if args.torchscript:
        geffnet.config.set_scriptable(True)

    # create model
    model = geffnet.create_model(
        args.model,
        num_classes=args.num_classes,
        in_chans=3,
        pretrained=args.pretrained,
        checkpoint_path=args.checkpoint)

    if args.torchscript:
        torch.jit.optimized_execution(True)
        model = torch.jit.script(model)

    print('Model %s created, param count: %d' %
          (args.model, sum([m.numel() for m in model.parameters()])))

    data_config = resolve_data_config(model, args)

    criterion = nn.CrossEntropyLoss()

    if not args.no_cuda:
        if args.num_gpu > 1:
            model = torch.nn.DataParallel(
                model, device_ids=list(range(args.num_gpu))).cuda()
        else:
            model = model.cuda()
        criterion = criterion.cuda()

    # LPOT tuning path: quantize, save, and exit without evaluating.
    if args.tune:
        model.eval()
        model.fuse_model()
        conf_yaml = "conf_" + args.model + ".yaml"
        from lpot.experimental import Quantization, common
        quantizer = Quantization(conf_yaml)
        quantizer.model = common.Model(model)
        q_model = quantizer()
        q_model.save(args.tuned_checkpoint)
        exit(0)

    valdir = os.path.join(args.data, 'val')
    loader = create_loader(
        Dataset(valdir, load_bytes=args.tf_preprocessing),
        input_size=data_config['input_size'],
        batch_size=args.batch_size,
        use_prefetcher=not args.no_cuda,
        interpolation=data_config['interpolation'],
        mean=data_config['mean'],
        std=data_config['std'],
        num_workers=args.workers,
        crop_pct=data_config['crop_pct'],
        tensorflow_preprocessing=args.tf_preprocessing)

    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    model.eval()
    model.fuse_model()
    # choose the evaluated model: int8 restored from checkpoint, or fp32 as-is
    if args.int8:
        from lpot.utils.pytorch import load
        new_model = load(
            os.path.abspath(os.path.expanduser(args.tuned_checkpoint)), model)
    else:
        new_model = model
    with torch.no_grad():
        # NOTE: `input` shadows the builtin; kept as-is (doc-only change).
        for i, (input, target) in enumerate(loader):
            # start timing only after the warmup batches
            if i >= args.warmup_iterations:
                start = time.time()
            if not args.no_cuda:
                target = target.cuda()
                input = input.cuda()

            # compute output
            output = new_model(input)
            loss = criterion(output, target)

            # measure accuracy and record loss
            prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
            losses.update(loss.item(), input.size(0))
            top1.update(prec1.item(), input.size(0))
            top5.update(prec5.item(), input.size(0))

            if i >= args.warmup_iterations:
                # measure elapsed time
                batch_time.update(time.time() - start)

            if i % args.print_freq == 0:
                print(
                    'Test: [{0}/{1}]\t'
                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f}, {rate_avg:.3f}/s) \t'
                    'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                    'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                    'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                        i, len(loader), batch_time=batch_time,
                        rate_avg=input.size(0) / batch_time.avg,
                        loss=losses, top1=top1, top5=top5))
            # stop after the requested number of timed iterations
            if args.iterations > 0 and i >= args.iterations + args.warmup_iterations - 1:
                break

    print('Batch size = %d' % args.batch_size)
    if args.batch_size == 1:
        print('Latency: %.3f ms' % (batch_time.avg * 1000))
    print('Throughput: %.3f images/sec' % (args.batch_size / batch_time.avg))
    # accuracies are divided by 100 to report fractions rather than percent
    print('Accuracy: {top1:.5f} Accuracy@5 {top5:.5f}'.format(
        top1=(top1.avg / 100), top5=(top5.avg / 100)))
def ensemble(total_params):
    """Decode the test set with an ensemble of separately trained models.

    Each model's variables are loaded from its own latest checkpoint under a
    "<scope>_ensembler_<i>" rename, EMA weights are swapped in where enabled,
    the ensemble decoding graph is run over the shared test set, and the BLEU
    score is returned. Translations are dumped to `default_params.test_output`.
    """
    # loading dataset
    tf.logging.info("Begin Loading Test Dataset")
    start_time = time.time()

    # assume that different configurations use the same test file
    default_params = total_params[0]

    # assume that different models share the same source and target vocabulary, usually it's the case
    test_dataset = Dataset(default_params.src_test_file, default_params.src_test_file,
                           default_params.src_vocab, default_params.src_vocab,
                           default_params.eval_max_len,
                           batch_or_token='batch',
                           data_leak_ratio=default_params.data_leak_ratio)
    tf.logging.info(
        "End Loading dataset, within {} seconds".format(time.time() - start_time))

    # Build Graph
    with tf.Graph().as_default():
        # source-only placeholders: decoding needs no target input
        features = []
        for fidx in range(max(len(default_params.gpus), 1)):
            feature = {
                "source": tf.placeholder(tf.int32, [None, None], "source"),
            }
            features.append(feature)

        # session info
        sess = util.get_session(default_params.gpus)

        tf.logging.info("Begining Building Evaluation Graph")
        start_time = time.time()

        # get graph (one per ensembled configuration)
        total_graphs = [model.get_model(params.model_name) for params in total_params]

        # set up infer graph
        eval_seqs, eval_scores = tower_ensemble_graph(features, total_graphs, total_params)

        tf.logging.info("End Building Inferring Graph, within {} seconds".format(time.time() - start_time))

        # set up ema
        # collect ema variables: map each EMA-enabled ensemble scope to its vars
        ema_used_models = {}
        for midx, params in enumerate(total_params):
            if params.ema_decay > 0.:
                ema_used_models[params.scope_name + "_ensembler_%d" % midx] = []

        for var in tf.trainable_variables():
            name = var.op.name
            key = name[:name.find('/')]  # top-level scope identifies the model
            if key in ema_used_models:
                ema_used_models[key].append(var)

        # build one assign op replacing each tracked var with its EMA average
        ema_assign_list = [tf.no_op()]
        for midx, params in enumerate(total_params):
            if params.ema_decay > 0.:
                key = params.scope_name + "_ensembler_%d" % midx

                ema = tf.train.ExponentialMovingAverage(decay=params.ema_decay)
                ema.apply(ema_used_models[key])

                ema_assign_list += [tf.assign(var, ema.average(var).read_value())
                                    for var in ema_used_models[key]]
        ema_assign_op = tf.group(*ema_assign_list)

        # initialize the model
        sess.run(tf.global_variables_initializer())

        # log parameters
        util.variable_printer()

        # restore parameters
        tf.logging.info("Trying restore existing parameters")
        all_var_list = {}
        for midx, params in enumerate(total_params):
            # resolve each model's latest checkpoint from its 'checkpoint' file
            checkpoint = os.path.join(params.output_dir, "checkpoint")
            assert tf.gfile.Exists(checkpoint)

            latest_checkpoint = tf.gfile.Open(checkpoint).readline()
            model_name = latest_checkpoint.strip().split(":")[1].strip()
            model_name = model_name[1:-1]  # remove ""
            model_path = os.path.join(params.output_dir, model_name)
            model_path = os.path.abspath(model_path)
            assert tf.gfile.Exists(model_path + ".meta")

            tf.logging.warn("Starting Backup Restore {}-th Model".format(midx))
            reader = tf.train.load_checkpoint(model_path)

            # adapt the model names: prefix checkpoint vars with the ensemble scope
            for name, shape in tf.train.list_variables(model_path):
                model_name = name.split('/')[0]
                ensemble_name = "{}_ensembler_{}/{}".format(model_name, midx, name[name.find('/') + 1:])
                all_var_list[ensemble_name] = reader.get_tensor(name)

        # assign loaded tensors to graph variables; log any unmatched names
        ops = []
        for var in tf.global_variables():
            name = var.op.name
            if name in all_var_list:
                tf.logging.info('{} **Good**'.format(name))
                ops.append(tf.assign(var, all_var_list[name]))
            else:
                tf.logging.warn("{} --Bad--".format(name))
        restore_op = tf.group(*ops, name="restore_global_vars")
        sess.run(restore_op)
        sess.run(ema_assign_op)

        tf.logging.info("Starting Evaluating")
        eval_start_time = time.time()
        tranes, scores, indices = evalu.decoding(sess, features, eval_seqs, eval_scores, test_dataset, default_params)
        bleu = evalu.eval_metric(tranes, default_params.tgt_test_file, indices=indices)
        eval_end_time = time.time()

        tf.logging.info(
            "{} Scores {}, BLEU {}, Duration {}s".format(
                util.time_str(eval_end_time), np.mean(scores), bleu,
                eval_end_time - eval_start_time)
        )

        # save translation
        evalu.dump_tanslation(tranes, default_params.test_output, indices=indices)

        return bleu
def main():
    """Validate a geffnet model on a labelled image dataset.

    Builds the model from CLI args (optionally TorchScripted, channels-last,
    and/or under native AMP autocast), evaluates it over args.data, and
    prints running and final top-1/top-5 precision.
    """
    args = parser.parse_args()

    if not args.checkpoint and not args.pretrained:
        args.pretrained = True

    # amp_autocast is a context-manager factory: contextlib.suppress (a no-op
    # context) by default, torch.cuda.amp.autocast when AMP is requested.
    amp_autocast = suppress  # do nothing
    if args.amp:
        if not has_native_amp:
            print(
                "Native Torch AMP is not available (requires torch >= 1.6), using FP32."
            )
        else:
            amp_autocast = torch.cuda.amp.autocast

    # create model
    model = geffnet.create_model(
        args.model,
        num_classes=args.num_classes,
        in_chans=3,
        pretrained=args.pretrained,
        checkpoint_path=args.checkpoint,
        scriptable=args.torchscript)

    if args.channels_last:
        model = model.to(memory_format=torch.channels_last)

    if args.torchscript:
        torch.jit.optimized_execution(True)
        model = torch.jit.script(model)

    print('Model %s created, param count: %d' %
          (args.model, sum([m.numel() for m in model.parameters()])))

    data_config = resolve_data_config(model, args)

    criterion = nn.CrossEntropyLoss()

    if not args.no_cuda:
        if args.num_gpu > 1:
            model = torch.nn.DataParallel(
                model, device_ids=list(range(args.num_gpu))).cuda()
        else:
            model = model.cuda()
        criterion = criterion.cuda()

    loader = create_loader(
        Dataset(args.data, load_bytes=args.tf_preprocessing),
        input_size=data_config['input_size'],
        batch_size=args.batch_size,
        use_prefetcher=not args.no_cuda,
        interpolation=data_config['interpolation'],
        mean=data_config['mean'],
        std=data_config['std'],
        num_workers=args.workers,
        crop_pct=data_config['crop_pct'],
        tensorflow_preprocessing=args.tf_preprocessing)

    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    model.eval()
    end = time.time()
    with torch.no_grad():
        # NOTE: `input` shadows the builtin; kept as-is (doc-only change).
        for i, (input, target) in enumerate(loader):
            if not args.no_cuda:
                target = target.cuda()
                input = input.cuda()
            if args.channels_last:
                input = input.contiguous(memory_format=torch.channels_last)

            # compute output (under autocast only when AMP is enabled)
            with amp_autocast():
                output = model(input)
            loss = criterion(output, target)

            # measure accuracy and record loss
            prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
            losses.update(loss.item(), input.size(0))
            top1.update(prec1.item(), input.size(0))
            top5.update(prec5.item(), input.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % args.print_freq == 0:
                print(
                    'Test: [{0}/{1}]\t'
                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f}, {rate_avg:.3f}/s) \t'
                    'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                    'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                    'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                        i, len(loader), batch_time=batch_time,
                        rate_avg=input.size(0) / batch_time.avg,
                        loss=losses, top1=top1, top5=top5))

    # parenthesized values are the error rates (100 - precision)
    print(
        ' * Prec@1 {top1.avg:.3f} ({top1a:.3f}) Prec@5 {top5.avg:.3f} ({top5a:.3f})'
        .format(top1=top1, top1a=100 - top1.avg, top5=top5, top5a=100. - top5.avg))
def load_dataset():
    """Build the Dataset from FLAGS.data_path and require it to be non-empty."""
    loaded = Dataset(FLAGS.data_path, verbose=True)
    # An empty dataset indicates a bad path or parse failure — fail fast.
    assert len(loaded.samples) > 0
    return loaded
from data import Dataset from options import args import os from model import create_model import utils # for k, v in vars(args).items(): # print("{}: {}".format(k, v)) src = args.test_src dest = os.path.join(args.test_dest, args.name) if not os.path.isdir(dest): os.mkdir(dest) dataset = Dataset(src) loader = DataLoader(dataset, batch_size=args.test_batch_size, shuffle=False, num_workers=args.num_workers, pin_memory=True) model = create_model(args) model.eval() if args.pre_train_model != "...": print("Loading pretrained model ... ") model.load_state_dict(torch.load(args.pre_train_model)) print("Testing...") with torch.no_grad(): for i, (name, lr) in enumerate(loader):
#test_no = df[df.iloc[:,0] == 'testing'].shape[0] #def train(batch_size=4, nb_epoch=10): checkpointer = ModelCheckpoint(filepath=os.path.join('data', 'checkpoints', 'lstm' + '-' + 'features' + '.{epoch:03d}-{val_loss:.3f}.hdf5'),verbose=1,save_best_only=True) tb = TensorBoard(log_dir=os.path.join('data', 'logs', 'lstm')) early_stopper = EarlyStopping(patience=5) timestamp = time.time() csv_logger = CSVLogger(os.path.join('data', 'logs', 'lstm' + '-' + 'training-' + str(timestamp) + '.log')) data = Dataset( seq_length=seq, class_limit=2, ) steps_per_epoch = 4 X, y = data.get_all_sequences_in_memory('training', hyper, seq) X_test, y_test = data.get_all_sequences_in_memory('testing', hyper, seq) #X_test, y_test = data.get_all_sequences_in_memory('testing', cnt, seq) rm = ResearchModels(len(data.classes),'lstm',data.seq_length, None) print("##################################################") #X=X[2:] #X_test=X_test[2:] print(X.shape) X=np.ravel(X)
# cylib.collect_confusion_matrix(y_pred_np.reshape(-1), # y_np.reshape(-1), conf_mat) # conf_mat_all += conf_mat_np.astype(np.uint64) if i % 10 == 0: string = 'batch %03d loss = %.2f (%.1f images/sec)' % \ (i, loss_np, x_np.shape[0] / duration) print(string) print(conf_mat) return utils.print_stats(conf_mat, 'Validation', Dataset.class_info) # BEGINING tf.set_random_seed(31415) train_data = Dataset('train', batch_size) val_data = Dataset('val', batch_size, shuffle=False) height = train_data.height width = train_data.width channels = train_data.channels # x = tf.placeholder(tf.float32, shape=(batch_size, height, width, channels)) # y = tf.placeholder(tf.int32, shape=(batch_size, height, width)) # create placeholders for inputs with tf.name_scope('data'): x = tf.placeholder(tf.float32, shape=(None, height, width, channels), name='rgb_images') y = tf.placeholder(tf.int32, shape=(None, height, width), name='labels')
def __init__(self, dataset=None):
    """Initialize the evaluator over `dataset`, creating a default Dataset when none is given."""
    if dataset is None:
        dataset = Dataset()
    super(Evaluator, self).__init__(dataset)
def main():
    """Preprocess CNN/DailyMail (or DUC2007) stories into index arrays.

    Pipeline: read raw documents/summaries, build a vocabulary from the train
    split (top ``--max-word-num`` words), replace words with integer indices
    (padding every sentence to the document's longest sentence), build a
    GloVe-based embedding weight matrix (averaging in-window neighbor vectors
    for words missing from GloVe), and pickle ``data``, ``length``,
    ``weight_matrix``, ``wtoi`` and ``itow`` (in that order) to ``--save-path``.
    For ``--data duc2007`` the vocabulary and weights of a previously
    preprocessed CNN/DailyMail dataset are reused so indices stay compatible.
    """
    datasets = {
        'cnn+dailymail': read_cnn_dailymail,
        'cnn': read_cnn_dailymail,
        'daily': read_cnn_dailymail,
        'duc2007': read_duc2007,
    }
    parser = argparse.ArgumentParser()
    parser.add_argument('--glove', default='/data/sjx/glove.6B.100d.py36.pkl',
                        help='pickle file of glove')
    parser.add_argument('--data', default='cnn+dailymail',
                        choices=datasets.keys())
    parser.add_argument(
        '--data-dir',
        default=
        '/data/share/cnn_stories/stories;/data/share/dailymail_stories/stories',
        help=
        'If data=cnn+dailimail, then data-dir must contain two paths for cnn and dailymail seperated by ;.'
    )
    parser.add_argument('--save-path', required=True)
    parser.add_argument('--max-word-num', type=int, default=50000)
    args = parser.parse_args()

    print('Loading glove......')
    # FIX: load the glove pickle via a context manager; the previous
    # pickle.load(open(...)) never closed the file handle.
    # NOTE(review): pickle.load on an external file is unsafe for untrusted
    # input -- acceptable here only because the path is a local data file.
    with open(args.glove, 'rb') as glove_file:
        glove = pickle.load(glove_file)
    word_dim = len(glove['the'])
    print('Word dim = %d' % word_dim)

    print('Reading data......')
    data, length = datasets[args.data](args.data, args.data_dir)
    print('train/valid/test: %d/%d/%d' % tuple([len(_) for _ in data]))

    print('Count word frequency only from train set......')
    wtof = {}
    if (args.data == 'duc2007'):
        pass  # vocabulary is taken from the CNN/DailyMail dataset below
    else:
        for j in range(len(data[0])):  # j-th sample of train set
            for k in range(2):  # 0: content, 1: summary
                for l in range(len(data[0][j][k])):  # l-th sentence
                    for word in data[0][j][k][l]:
                        wtof[word] = wtof.get(word, 0) + 1
        wtof = Counter(wtof).most_common(args.max_word_num)
        needed_words = {w[0]: w[1] for w in wtof}
        # print('Preserve word num: %d. Examples: %s %s' % (len(needed_words), wtof[0][0], wtof[1][0]))

    itow = ['<pad>', '<unk>']
    wtoi = {'<pad>': 0, '<unk>': 1}
    count = 2
    glove['<pad>'] = np.zeros((word_dim, ))
    glove['<unk>'] = np.zeros((word_dim, ))
    missing_word_neighbors = {}

    print('Replace word string with word index......')
    if (args.data == 'duc2007'):
        # Reuse vocabulary/weights from an already-preprocessed corpus.
        cnn_data = Dataset(path='/data/c-liang/data/cnndaily_5w_100d.pkl')
        needed_words = cnn_data.wtoi
        wtoi = cnn_data.wtoi
        itow = cnn_data.itow
        for i in range(len(data)):
            for j in range(len(data[i])):
                for k in range(2):  # 0: content, 1: summary
                    # max length of sentences for padding
                    max_len = max([len(s) for s in data[i][j][k]])
                    for l in range(len(data[i][j][k])):  # l-th sentence
                        for m, word in enumerate(data[i][j][k][l]):  # m-th word
                            if word not in wtoi:
                                word = '<unk>'
                            data[i][j][k][l][m] = wtoi[word]
                        # padding l-th sentence
                        data[i][j][k][l] += [0] * (max_len - len(data[i][j][k][l]))
                    # np.array for all documents/summaries
                    # shape of each document/summary: (# sentence, max length)
                    data[i][j][k] = np.asarray(data[i][j][k], dtype='int32')
                    length[i][j][k] = np.asarray(length[i][j][k], dtype='int32')
    else:
        for i in range(len(data)):
            for j in range(len(data[i])):
                for k in range(2):  # 0: content, 1: summary
                    # max length of sentences for padding
                    max_len = max([len(s) for s in data[i][j][k]])
                    for l in range(len(data[i][j][k])):  # l-th sentence
                        for m, word in enumerate(data[i][j][k][l]):  # m-th word
                            if word not in needed_words:
                                word = '<unk>'
                            elif word not in wtoi:
                                itow.append(word)
                                wtoi[word] = count
                                count += 1
                                #print(word)
                            data[i][j][k][l][m] = wtoi[word]
                            # Find neighbor vectors for those words not in glove.
                            # NOTE(review): words earlier in the sentence have
                            # already been replaced by int indices, so only
                            # still-string neighbors (at positions > m) can hit
                            # glove keys here -- confirm this is intended.
                            if word not in glove:
                                if word not in missing_word_neighbors:
                                    missing_word_neighbors[word] = []
                                # FIX: clamp the window start at 0; a negative
                                # slice start (m < 5) wrapped around and pulled
                                # neighbors from the END of the sentence.
                                for neighbor in data[i][j][k][l][max(0, m - 5):m + 6]:  # window size: 10
                                    if neighbor in glove:
                                        missing_word_neighbors[word].append(
                                            glove[neighbor])
                        # padding l-th sentence
                        if (max_len > len(data[i][j][k][l])):
                            data[i][j][k][l] += [0] * int(
                                max_len - len(data[i][j][k][l]))
                    # np.array for all documents/summaries
                    # shape of each document/summary: (# sentence, max length)
                    data[i][j][k] = np.asarray(data[i][j][k], dtype='int32')
                    length[i][j][k] = np.asarray(length[i][j][k], dtype='int32')

    print('Calculate vectors for missing words by averaging neighbors......')
    #print(data)
    if (args.data == 'duc2007'):
        weight_matrix = cnn_data.weight
    else:
        for word in missing_word_neighbors:
            vectors = missing_word_neighbors[word]
            if len(vectors) > 0:
                glove[word] = sum(vectors) / len(vectors)
            else:
                glove[word] = np.zeros((word_dim, ))
        weight_matrix = np.vstack([glove[w] for w in itow])
    print('Shape of weight matrix:')
    print(weight_matrix.shape)

    print('Dumping......')
    #print(data[2][0][0], data[2][1][0])
    # FIX: write the output pickle via a context manager so the file is closed
    # and flushed even if a dump raises (previously bare open()/close()).
    with open(args.save_path, 'wb') as save_file:
        pickle.dump(data, save_file)
        pickle.dump(length, save_file)
        pickle.dump(weight_matrix, save_file)
        pickle.dump(wtoi, save_file)
        pickle.dump(itow, save_file)
# NOTE(review): collapsed fragment of a Keras evaluation script. The opening
# `if s[0] != 224 ... yield img, label[0]` is the tail of a generator whose `def` line is
# outside this chunk; the rest loads a saved model, runs it over a test generator, saves
# per-slice probabilities, and thresholds them at 0.5 into binary predictions. It references
# names (`relist`, `predict`, `test_set`, `seed`, `evaluate`) not defined in this chunk.
# Left byte-identical because the fragment is truncated.
if s[0] != 224: img = misc.imresize(img, (224, 224), 'bilinear') img = np.stack((img,img,img),axis =2) yield img, label[0] from keras.models import load_model model = load_model('/media/user1/model.h5') test_features,test_label = relist(predict(test_set)) test_generator = Dataset( test_features, test_label, augment=False, shuffle=False, input_form='t1', seed=seed, ) test_generator.reset() test_results = evaluate.get_results(model, test_generator) probabilities = list(evaluate.transform_binary_probabilities(test_results)) np.save('./test_slice_pro.npy',probabilities) lg_pred = np.zeros((len(probabilities))) for i in range(len(probabilities)): if probabilities[i]<0.5: lg_pred[i] = 0 else: lg_pred[i] = 1
def main():
    """Train a DreamModel on users' prior baskets and checkpoint the best epoch.

    Builds (optionally reorder-aware) basket Datasets from raw features,
    trains for ``dr_config.epochs`` epochs, evaluates after every epoch, and
    saves the model whenever validation loss improves.  KeyboardInterrupt
    stops training early and is reported, not re-raised.
    """
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = constants.GPUS
    torch_tvm.enable(opt_level=3, device_type="gpu", device="cuda", host="llvm")

    # Prepare input
    bc = BasketConstructor(constants.RAW_DATA_DIR, constants.FEAT_DATA_DIR)
    # Users' baskets
    ub_basket = bc.get_baskets('prior', reconstruct=False)
    if constants.REORDER:
        # Users' reordered baskets
        ub_rbks = bc.get_baskets('prior', reconstruct=False, reordered=True)
        # User's item history
        ub_ihis = bc.get_item_history('prior', reconstruct=False)
        # Train test split
        train_ub, test_ub, train_rbks, test_rbks, train_ihis, test_ihis = train_test_split(
            ub_basket, ub_rbks, ub_ihis, test_size=0.2)
        del ub_basket, ub_rbks, ub_ihis  # memory saving
        train_ub, test_ub = Dataset(train_ub, train_rbks, train_ihis), Dataset(
            test_ub, test_rbks, test_ihis)
        del train_rbks, test_rbks, train_ihis, test_ihis  # memory saving
    else:
        train_ub, test_ub = train_test_split(ub_basket, test_size=0.2)
        del ub_basket
        train_ub, test_ub = Dataset(train_ub), Dataset(test_ub)

    # Model config
    dr_config = Config(constants.DREAM_CONFIG)
    dr_model = DreamModel(dr_config)
    if dr_config.cuda:
        dr_model.cuda()

    # Optimizer
    # NOTE(review): `optim` (like train_ub/test_ub) is not referenced below;
    # presumably the train/evaluate helpers pick these up -- confirm before
    # removing.
    optim = torch.optim.Adam(dr_model.parameters(), lr=dr_config.learning_rate)
    # optim = torch.optim.Adadelta(dr_model.parameters())
    # optim = torch.optim.SGD(dr_model.parameters(), lr=dr_config.learning_rate, momentum=0.9)

    writer = SummaryWriter(log_dir='runs/{}'.format(
        dr_config.alias))  # tensorboard writer
    writer.add_text('config', str(dr_config))

    best_val_loss = None
    try:
        for k, v in constants.DREAM_CONFIG.items():
            print(k, v)
        # training
        for epoch in range(dr_config.epochs):
            if constants.REORDER:
                train_reorder_dream()
            else:
                train_dream()
            print('-' * 89)
            if constants.REORDER:
                val_loss = evaluate_reorder_dream()
            else:
                val_loss = evaluate_dream()
            print('-' * 89)
            # checkpoint
            # FIX: compare against None explicitly -- the old
            # `if not best_val_loss` also fired on a legitimate best loss of
            # 0.0 and would re-save without a real improvement.
            if best_val_loss is None or val_loss < best_val_loss:
                with open(
                        dr_config.checkpoint_dir.format(epoch=epoch,
                                                        loss=val_loss),
                        'wb') as f:
                    torch.save(dr_model, f)
                best_val_loss = val_loss
            else:
                # Manual SGD slow down lr if no improvement in val_loss
                # dr_config.learning_rate = dr_config.learning_rate / 4
                pass
    except KeyboardInterrupt:
        print('*' * 89)
        print('Got keyboard Interrupt and stopped early')
def main(args): np.random.seed(args.seed) torch.manual_seed(args.seed) train_data = Dataset(args.train_file) val_data = Dataset(args.val_file) train_sents = train_data.batch_size.sum() vocab_size = int(train_data.vocab_size) print('Train data: %d batches' % len(train_data)) print('Val data: %d batches' % len(val_data)) print('Word vocab size: %d' % vocab_size) if args.slurm == 0: # cuda.set_device(args.gpu) gpu_id = 0 device = torch.device( f"cuda:{gpu_id}" if torch.cuda.is_available() else "cpu") if args.train_from == '': model = RNNVAE(vocab_size=vocab_size, enc_word_dim=args.enc_word_dim, enc_h_dim=args.enc_h_dim, enc_num_layers=args.enc_num_layers, dec_word_dim=args.dec_word_dim, dec_h_dim=args.dec_h_dim, dec_num_layers=args.dec_num_layers, dec_dropout=args.dec_dropout, latent_dim=args.latent_dim, mode=args.model) for param in model.parameters(): param.data.uniform_(-0.1, 0.1) else: print('loading model from ' + args.train_from) checkpoint = torch.load(args.train_from) model = checkpoint['model'] print("model architecture") print(model) optimizer = torch.optim.SGD(model.parameters(), lr=args.lr) if args.warmup == 0: args.beta = 1. 
else: args.beta = args.kl_start criterion = nn.NLLLoss(reduce=False) # criterion = nn.NLLLoss() # model.cuda() # criterion.cuda() # model = torch.nn.DataParallel(net, device_ids=[0, 1]) model.to(device) criterion.to(device) model.train() def variational_loss(input, sents, model, z=None): mean, logvar = input z_samples = model._reparameterize(mean, logvar, z) preds = model._dec_forward(sents, z_samples) nll = sum([ criterion(preds[:, l], sents[:, l + 1]) for l in range(preds.size(1)) ]) kl = utils.kl_loss_diag(mean, logvar) return nll + args.beta * kl update_params = list(model.dec.parameters()) meta_optimizer = OptimN2N(variational_loss, model, update_params, eps=args.eps, lr=[args.svi_lr1, args.svi_lr2], iters=args.svi_steps, momentum=args.momentum, acc_param_grads=args.train_n2n == 1, max_grad_norm=args.svi_max_grad_norm) if args.test == 1: args.beta = 1 test_data = Dataset(args.test_file) eval(args, test_data, model, meta_optimizer, device) exit() t = 0 best_val_nll = 1e5 best_epoch = 0 val_stats = [] epoch = 0 while epoch < args.num_epochs: start_time = time.time() epoch += 1 print('Starting epoch %d' % epoch) train_nll_vae = 0. train_nll_autoreg = 0. train_kl_vae = 0. train_nll_svi = 0. train_kl_svi = 0. train_kl_init_final = 0. num_sents = 0 num_words = 0 b = 0 for i in np.random.permutation(len(train_data)): if args.warmup > 0: args.beta = min( 1, args.beta + 1. 
/ (args.warmup * len(train_data))) sents, length, batch_size = train_data[i] length = length.item() batch_size = batch_size.item() if args.gpu >= 0: # sents = sents.cuda() sents = sents.to(device) # batch_size = batch_size.to(device) b += 1 optimizer.zero_grad() if args.model == 'autoreg': preds = model._dec_forward(sents, None, True) tgt = sents[:, 1:].contiguous() nll_autoreg = criterion(preds.view(-1, preds.size(2)), tgt.view(-1)).view(preds.size(0), -1).sum(-1).mean(0) # nll_autoreg = sum([criterion(preds[:, l], sents[:, l+1]) for l in range(length)]) train_nll_autoreg += nll_autoreg.item() * batch_size # train_nll_autoreg += nll_autoreg.data[0]*batch_size #old nll_autoreg.backward() elif args.model == 'svi': # mean_svi = Variable(0.1*torch.zeros(batch_size, args.latent_dim).cuda(), requires_grad = True) # logvar_svi = Variable(0.1*torch.zeros(batch_size, args.latent_dim).cuda(), requires_grad = True) mean_svi = Variable( 0.1 * torch.zeros(batch_size, args.latent_dim).to(device), requires_grad=True) logvar_svi = Variable( 0.1 * torch.zeros(batch_size, args.latent_dim).to(device), requires_grad=True) var_params_svi = meta_optimizer.forward( [mean_svi, logvar_svi], sents, b % args.print_every == 0) mean_svi_final, logvar_svi_final = var_params_svi z_samples = model._reparameterize(mean_svi_final.detach(), logvar_svi_final.detach()) preds = model._dec_forward(sents, z_samples) tgt = sents[:, 1:].contiguous() nll_svi = criterion(preds.view(-1, preds.size(2)), tgt.view(-1)).view(preds.size(0), -1).sum(-1).mean(0) # nll_svi = sum([criterion(preds[:, l], sents[:, l+1]) for l in range(length)]) train_nll_svi += nll_svi.data[0] * batch_size kl_svi = utils.kl_loss_diag(mean_svi_final, logvar_svi_final) train_kl_svi += kl_svi.data[0] * batch_size var_loss = nll_svi + args.beta * kl_svi var_loss.backward(retain_graph=True) else: mean, logvar = model._enc_forward(sents) z_samples = model._reparameterize(mean, logvar) preds = model._dec_forward(sents, z_samples) tgt = 
sents[:, 1:].contiguous() nll_vae = criterion(preds.view(-1, preds.size(2)), tgt.view(-1)).view(preds.size(0), -1).sum(-1).mean(0) # nll_vae = sum([criterion(preds[:, l], sents[:, l+1]) for l in range(length)]) # train_nll_vae += nll_vae.data[0]*batch_size#old train_nll_vae += nll_vae.item() * batch_size kl_vae = utils.kl_loss_diag(mean, logvar) # train_kl_vae += kl_vae.data[0]*batch_size#old train_kl_vae += kl_vae.item() * batch_size if args.model == 'vae': vae_loss = nll_vae + args.beta * kl_vae vae_loss.backward(retain_graph=True) if args.model == 'savae': var_params = torch.cat([mean, logvar], 1) mean_svi = Variable(mean.data, requires_grad=True) logvar_svi = Variable(logvar.data, requires_grad=True) var_params_svi = meta_optimizer.forward( [mean_svi, logvar_svi], sents, b % args.print_every == 0) mean_svi_final, logvar_svi_final = var_params_svi z_samples = model._reparameterize(mean_svi_final, logvar_svi_final) preds = model._dec_forward(sents, z_samples) tgt = sents[:, 1:].contiguous() nll_svi = criterion(preds.view(-1, preds.size(2)), tgt.view(-1)).view(preds.size(0), -1).sum(-1).mean(0) # nll_svi = sum([criterion(preds[:, l], sents[:, l+1]) for l in range(length)]) train_nll_svi += nll_svi.data[0] * batch_size kl_svi = utils.kl_loss_diag(mean_svi_final, logvar_svi_final) train_kl_svi += kl_svi.data[0] * batch_size var_loss = nll_svi + args.beta * kl_svi var_loss.backward(retain_graph=True) if args.train_n2n == 0: if args.train_kl == 1: mean_final = mean_svi_final.detach() logvar_final = logvar_svi_final.detach() kl_init_final = utils.kl_loss( mean, logvar, mean_final, logvar_final) train_kl_init_final += kl_init_final.data[ 0] * batch_size kl_init_final.backward(retain_graph=True) else: vae_loss = nll_vae + args.beta * kl_vae var_param_grads = torch.autograd.grad( vae_loss, [mean, logvar], retain_graph=True) var_param_grads = torch.cat(var_param_grads, 1) var_params.backward(var_param_grads, retain_graph=True) else: var_param_grads = 
meta_optimizer.backward( [mean_svi_final.grad, logvar_svi_final.grad], b % args.print_every == 0) var_param_grads = torch.cat(var_param_grads, 1) var_params.backward(var_param_grads) if args.max_grad_norm > 0: torch.nn.utils.clip_grad_norm(model.parameters(), args.max_grad_norm) optimizer.step() num_sents += batch_size num_words += batch_size * length # num_sents = num_sents.item() # num_words = num_words.item() if b % args.print_every == 0: param_norm = sum([p.norm()**2 for p in model.parameters()]).data[0]**0.5 print( 'Iters: %d, Epoch: %d, Batch: %d/%d, LR: %.4f, TrainARPPL: %.2f, TrainVAE_PPL: %.2f, TrainVAE_KL: %.4f, TrainVAE_PPLBnd: %.2f, TrainSVI_PPL: %.2f, TrainSVI_KL: %.4f, TrainSVI_PPLBnd: %.2f, KLInitFinal: %.2f, |Param|: %.4f, BestValPerf: %.2f, BestEpoch: %d, Beta: %.4f, Throughput: %.2f examples/sec' % (t, epoch, b + 1, len(train_data), args.lr, np.exp(train_nll_autoreg / num_words), np.exp( train_nll_vae / num_words), train_kl_vae / num_sents, np.exp((train_nll_vae + train_kl_vae) / num_words), np.exp( train_nll_svi / num_words), train_kl_svi / num_sents, np.exp((train_nll_svi + train_kl_svi) / num_words), train_kl_init_final / num_sents, param_norm, best_val_nll, best_epoch, args.beta, num_sents / (time.time() - start_time))) print('--------------------------------') print('Checking validation perf...') val_nll = eval(args, val_data, model, meta_optimizer, device) val_stats.append(val_nll) # if val_elbo > self.best_val_elbo: # self.not_improved = 0 # self.best_val_elbo = val_elbo # else: # self.not_improved += 1 # if self.not_improved % 5 == 0: # self.current_lr = self.current_lr * self.config.options.lr_decay # print(f'New LR {self.current_lr}') # model.optimizer = torch.optim.SGD(model.parameters(), lr=self.current_lr) # model.enc_optimizer = torch.optim.SGD(model.parameters(), lr=self.current_lr) # model.dec_optimizer = torch.optim.SGD(model.parameters(), lr=self.current_lr) if val_nll < best_val_nll: not_improved = 0 best_save = 
'{}_{}.pt'.format(args.checkpoint_path, best_val_nll) if os.path.exists(best_save): os.remove(best_save) best_val_nll = val_nll best_epoch = epoch model.cpu() checkpoint = { 'args': args.__dict__, 'model': model, 'val_stats': val_stats } print('Savaeng checkpoint to %s' % args.checkpoint_path) best_save = '{}_{}.pt'.format(args.checkpoint_path, best_val_nll) torch.save(checkpoint, best_save) # model.cuda() model.to(device) else: not_improved += 1 if not_improved % 5 == 0: not_improved = 0 args.lr = args.lr * args.lr_decay print(f'New LR: {args.lr}') for param_group in optimizer.param_groups: param_group['lr'] = args.lr