def main(opt, device):
    if not opt.nlog and not opt.test:
        sys.stdout = Logger(Path(opt.save_dir) / 'log_.txt')
    print_argument_options(opt)

    # Configure
    cuda = device.type != 'cpu'
    init_torch_seeds()

    dataset = load_datasets(opt.data, opt.batch_size, cuda, opt.workers)
    trainloader, testloader = dataset.trainloader, dataset.testloader
    opt.num_classes = dataset.num_classes
    print("Created dataset: {}".format(opt.data))

    model = build_models(opt.model, opt.num_classes).to(device)
    print(model)
    if cuda and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)
    print("Created model: {}".format(opt.model))

    if opt.test:
        acc, err = __testing(opt, model, testloader, 0, device)
        print("==> Test Accuracy (%): {}\t Error rate (%): {}".format(acc, err))
        return

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=opt.lr,
                                weight_decay=5e-04, momentum=0.9)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=opt.stepsize,
                                                gamma=opt.gamma)
    if opt.amp:
        opt.scaler = torch.cuda.amp.GradScaler(enabled=True)

    start_time = time.time()
    for epoch in range(opt.max_epoch):
        print("==> Epoch {}/{}".format(epoch + 1, opt.max_epoch))
        __training(opt, model, criterion, optimizer, trainloader, epoch, device)
        scheduler.step()

        if (opt.eval_freq > 0 and (epoch + 1) % opt.eval_freq == 0) or (epoch + 1) == opt.max_epoch:
            acc, err = __testing(opt, model, trainloader, epoch, device)
            print("==> Train Accuracy (%): {}\t Error rate (%): {}".format(acc, err))
            acc, err = __testing(opt, model, testloader, epoch, device)
            print("==> Test Accuracy (%): {}\t Error rate (%): {}".format(acc, err))
            save_model(model, epoch, name=opt.model, save_dir=opt.save_dir)

    elapsed = round(time.time() - start_time)
    elapsed = str(datetime.timedelta(seconds=elapsed))
    print("Finished. Total elapsed time (h:m:s): {}".format(elapsed))
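# Illustrative note (not part of the original script): with StepLR the learning rate used
# above decays as opt.lr * opt.gamma ** (epoch // opt.stepsize), e.g. lr=0.1, stepsize=30,
# gamma=0.1 gives 0.1 for epochs 0-29, 0.01 for 30-59, 0.001 afterwards.
# A minimal standalone sketch of that schedule (placeholder values):
#
#     p = torch.nn.Parameter(torch.zeros(1))
#     o = torch.optim.SGD([p], lr=0.1)
#     s = torch.optim.lr_scheduler.StepLR(o, step_size=30, gamma=0.1)
#     for epoch in range(90):
#         o.step()
#         s.step()
#         # o.param_groups[0]['lr'] is now 0.1, 0.01 or 0.001 depending on epoch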
def train(run_id,
          set_name,
          model_name,
          loss_type,
          m, d, k, alpha,
          n_iterations=1000,
          net_learning_rate=0.0001,
          cluster_learning_rate=0.001,
          chunk_size=32,
          refresh_clusters=50,
          norm_clusters=False,
          calc_acc_every=10,
          load_latest=True,
          save_every=200,
          save_path=configs.general.paths.models,
          plot_every=100,
          plots_path=configs.general.paths.graphing,
          plots_ext='.png',
          n_plot_samples=10,
          n_plot_classes=10):

    # Setup model directory
    save_path = os.path.join(save_path, "%s" % run_id)
    os.makedirs(save_path, exist_ok=True)

    # Setup plotting directory
    plots_path = os.path.join(plots_path, "%s" % run_id)
    os.makedirs(plots_path, exist_ok=True)

    net, input_size = load_net(model_name)

    # Load set and get train and test labels from datasets
    train_dataset, test_dataset = load_datasets(set_name, input_size=input_size)  # 299 for inception
    train_y = get_labels(train_dataset)
    test_y = get_labels(test_dataset)

    # Use the GPU
    net = torch.nn.DataParallel(net, device_ids=range(torch.cuda.device_count()))
    net.cuda()
    cudnn.benchmark = True

    # Make a list of cluster-refresh iterations if given an interval as an int
    if isinstance(refresh_clusters, int):
        refresh_clusters = list(range(0, n_iterations, refresh_clusters))

    # Get the initial embeddings using all samples in the training set
    initial_reps = compute_all_reps(net, train_dataset, chunk_size)

    # Create the loss object (this stores the cluster centroids)
    if loss_type == "magnet":
        the_loss = MagnetLoss(train_y, k, m, d, alpha=alpha)

        # Initialise the embeddings/representations/clusters
        print("Initialising the clusters")
        the_loss.update_clusters(initial_reps)

        # Setup the optimizer
        optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()),
                                     lr=net_learning_rate)
        optimizerb = None

    elif loss_type in ("repmet", "repmet2", "repmet3", "myloss1"):
        if loss_type == "repmet":
            the_loss = RepMetLoss(train_y, k, m, d, alpha=alpha)
        elif loss_type == "repmet2":
            the_loss = RepMetLoss2(train_y, k, m, d, alpha=alpha)
        elif loss_type == "repmet3":
            the_loss = RepMetLoss3(train_y, k, m, d, alpha=alpha)
        elif loss_type == "myloss1":
            the_loss = MyLoss1(train_y, k, m, d, alpha=alpha)

        # Initialise the embeddings/representations/clusters
        print("Initialising the clusters")
        the_loss.update_clusters(initial_reps)

        # Setup the optimizer(s): either train the centroids with the network parameters,
        # or give them their own optimizer with a separate learning rate
        if cluster_learning_rate < 0:
            optimizer = torch.optim.Adam(
                list(filter(lambda p: p.requires_grad, net.parameters())) + [the_loss.centroids],
                lr=net_learning_rate)
            optimizerb = None
        else:
            optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()),
                                         lr=net_learning_rate)
            optimizerb = torch.optim.Adam([the_loss.centroids], lr=cluster_learning_rate)

    # Resume from the latest checkpoint if one exists
    l = os.listdir(save_path)
    if load_latest and len(l) > 1:
        l.sort(reverse=True)

        state = torch.load("%s/%s" % (save_path, l[1]))  # ignore log.txt
        print("Loading model: %s/%s" % (save_path, l[1]))

        net.load_state_dict(state['state_dict'])
        optimizer.load_state_dict(state['optimizer'])
        if optimizerb:
            optimizerb.load_state_dict(state['optimizerb'])

        start_iteration = state['iteration'] + 1
        best_acc = state['best_acc']
        the_loss = state['the_loss']  # overwrite the loss
        plot_sample_indexs = state['plot_sample_indexs']
        plot_classes = state['plot_classes']
        plot_test_sample_indexs = state['plot_test_sample_indexs']
        plot_test_classes = state['plot_test_classes']
        batch_losses = state['batch_losses']
        train_accs = state['train_accs']
        test_accs = state['test_accs']

        test_acc = test_accs[0][-1]
        train_acc = train_accs[0][-1]
        test_acc_b = test_accs[1][-1]
        train_acc_b = train_accs[1][-1]
        test_acc_c = test_accs[2][-1]
        train_acc_c = train_accs[2][-1]
        test_acc_d = test_accs[3][-1]
        train_acc_d = train_accs[3][-1]
    else:
        # Randomly sample the classes, then the samples from each class, to plot
        plot_sample_indexs, plot_classes = get_indexs(train_y, n_plot_classes, n_plot_samples)
        plot_test_sample_indexs, plot_test_classes = get_indexs(test_y, n_plot_classes, n_plot_samples,
                                                                class_ids=plot_classes)

        batch_losses = []
        train_accs = [[], [], [], []]
        test_accs = [[], [], [], []]
        start_iteration = 0
        best_acc = 0
        test_acc = 0
        train_acc = 0
        test_acc_b = 0
        train_acc_b = 0
        test_acc_c = 0
        train_acc_c = 0
        test_acc_d = 0
        train_acc_d = 0

    # Let's plot the initial embeddings
    cluster_classes = the_loss.cluster_classes

    # Use this to get indexes (matching cluster classes) for the class ids (plot_classes) being plotted
    for i in range(len(cluster_classes)):
        cluster_classes[i] = the_loss.unique_y[cluster_classes[i]]

    cluster_indexs = []
    for ci in range(len(the_loss.cluster_classes)):
        if the_loss.cluster_classes[ci] in plot_classes:
            cluster_indexs.append(ci)

    if not load_latest or len(l) < 2:
        # Plot it
        graph(initial_reps[plot_sample_indexs],
              train_y[plot_sample_indexs],
              cluster_centers=ensure_numpy(the_loss.centroids)[cluster_indexs],
              cluster_classes=the_loss.cluster_classes[cluster_indexs],
              savepath="%s/emb-initial%s" % (plots_path, plots_ext))

    # Get some sample indexes to run the accuracy test on... compare these to the accuracy coming out of the batch calc
    test_train_inds, _ = get_indexs(train_y, len(set(train_y)), 10)

    # Set up the training loop
    iteration = None
    for iteration in range(start_iteration, n_iterations):
        # Sample a batch and do the forward-backward pass
        batch_example_inds, batch_class_inds = the_loss.gen_batch()

        # Get inputs and labels from the dataset
        batch_x = get_inputs(train_dataset, batch_example_inds).cuda()
        batch_y = torch.from_numpy(batch_class_inds).cuda()

        # Calc the outputs (embeddings) and then the loss + accuracies
        outputs = net(batch_x)
        batch_loss, batch_example_losses, batch_acc = the_loss.loss(outputs, batch_y)

        # Pass the gradient and update
        optimizer.zero_grad()
        if optimizerb:
            optimizerb.zero_grad()
        batch_loss.backward()
        optimizer.step()
        if optimizerb:
            optimizerb.step()

        if norm_clusters:
            # Also normalise the centroids (because repmet pushes them away from the unit sphere):
            # Option 1: sit on the hypersphere (use norm)
            # g = the_loss.centroids.norm(p=2, dim=0, keepdim=True)
            import torch.nn.functional as F
            the_loss.centroids.data = F.normalize(the_loss.centroids)

            # Option 2: sit on OR within the hypersphere (divide by max, which scales all evenly)
            # mx, _ = the_loss.centroids.max(0)
            # mx, _ = mx.max(0)
            # the_loss.centroids.data = the_loss.centroids / mx
            # (this doesn't work as it scales the axes independently)
        # Convert to plain Python/numpy types
        batch_loss = float(ensure_numpy(batch_loss))
        batch_example_losses = ensure_numpy(batch_example_losses)

        # Update the loss index
        the_loss.update_losses(batch_example_inds, batch_example_losses)

        if iteration > 0 and not iteration % calc_acc_every:
            # Calc all the accuracies
            train_reps = compute_reps(net, train_dataset, test_train_inds, chunk_size)
            test_test_inds, _ = get_indexs(test_y, len(set(test_y)), 10)
            test_reps = compute_reps(net, test_dataset, test_test_inds, chunk_size)

            test_acc = the_loss.calc_accuracy(test_reps, test_y[test_test_inds], method='simple')
            train_acc = the_loss.calc_accuracy(train_reps, train_y[test_train_inds], method='simple')
            test_acc_b = the_loss.calc_accuracy(test_reps, test_y[test_test_inds], method='magnet')
            train_acc_b = the_loss.calc_accuracy(train_reps, train_y[test_train_inds], method='magnet')
            test_acc_c = the_loss.calc_accuracy(test_reps, test_y[test_test_inds], method='repmet')
            train_acc_c = the_loss.calc_accuracy(train_reps, train_y[test_train_inds], method='repmet')

            # Removed because of failed runs with out-of-memory errors
            # test_acc_d = the_loss.calc_accuracy(test_reps, test_y[test_test_inds], method='unsupervised')
            # train_acc_d = the_loss.calc_accuracy(train_reps, train_y[test_train_inds], method='unsupervised')
            test_acc_d = test_acc_c
            train_acc_d = train_acc_c

            log_line = ("Iteration %06d/%06d: Tr. L: %0.3f :: Batch. A: %0.3f "
                        ":::: Tr. A - simple: %0.3f -- magnet: %0.3f -- repmet: %0.3f -- unsupervised: %0.3f "
                        ":::: Te. A - simple: %0.3f -- magnet: %0.3f -- repmet: %0.3f -- unsupervised: %0.3f"
                        % (iteration, n_iterations, batch_loss, batch_acc,
                           train_acc, train_acc_b, train_acc_c, train_acc_d,
                           test_acc, test_acc_b, test_acc_c, test_acc_d))
            with open(save_path + '/log.txt', 'a') as f:
                f.write(log_line + "\n")
            print(log_line)

            batch_ass_ids = np.unique(the_loss.assignments[batch_example_inds])

            os.makedirs("%s/batch-emb/" % plots_path, exist_ok=True)
            os.makedirs("%s/batch-emb-all/" % plots_path, exist_ok=True)
            os.makedirs("%s/batch-clusters/" % plots_path, exist_ok=True)

            graph(ensure_numpy(outputs),
                  train_y[batch_example_inds],
                  cluster_centers=ensure_numpy(the_loss.centroids)[batch_ass_ids],
                  cluster_classes=the_loss.cluster_classes[batch_ass_ids],
                  savepath="%s/batch-emb/i%06d%s" % (plots_path, iteration, plots_ext))

            graph(ensure_numpy(outputs),
                  train_y[batch_example_inds],
                  cluster_centers=ensure_numpy(the_loss.centroids),
                  cluster_classes=the_loss.cluster_classes,
                  savepath="%s/batch-emb-all/i%06d%s" % (plots_path, iteration, plots_ext))

            graph(np.zeros_like(ensure_numpy(outputs)),
                  np.zeros_like(train_y[batch_example_inds]),
                  cluster_centers=ensure_numpy(the_loss.centroids),
                  cluster_classes=the_loss.cluster_classes,
                  savepath="%s/batch-clusters/i%06d%s" % (plots_path, iteration, plots_ext))

        train_reps_this_iter = False
        if iteration in refresh_clusters:
            with open(save_path + '/log.txt', 'a') as f:
                f.write('Refreshing clusters')
            print('Refreshing clusters')

            train_reps = compute_all_reps(net, train_dataset, chunk_size=chunk_size)
            the_loss.update_clusters(train_reps)
            cluster_classes = the_loss.cluster_classes
            train_reps_this_iter = True

        # Store the stats to graph at the end
        batch_losses.append(batch_loss)
        # batch_accs.append(batch_acc)
        train_accs[0].append(train_acc)
        test_accs[0].append(test_acc)
        train_accs[1].append(train_acc_b)
        test_accs[1].append(test_acc_b)
        train_accs[2].append(train_acc_c)
        test_accs[2].append(test_acc_c)
        train_accs[3].append(train_acc_d)
        test_accs[3].append(test_acc_d)

        if iteration > 0 and not iteration % plot_every:
            # Use this to get indexes (matching cluster classes) for the class ids (plot_classes) being plotted
            for i in range(len(cluster_classes)):
                cluster_classes[i] = the_loss.unique_y[cluster_classes[i]]

            # Reuse the refreshed representations when possible: 1. we don't have to recalc,
            # 2. the k-means update occurred on these reps, which gives more faithful graphing.
            # If we were to re-get them with compute_reps(), batch norm and transforms could give different embeddings
            if train_reps_this_iter:
                plot_train_emb = train_reps[test_train_inds]
            else:
                plot_train_emb = compute_reps(net, train_dataset, test_train_inds, chunk_size=chunk_size)

            plot_test_emb = compute_reps(net, test_dataset, plot_test_sample_indexs, chunk_size=chunk_size)

            os.makedirs("%s/train-emb/" % plots_path, exist_ok=True)
            os.makedirs("%s/test-emb/" % plots_path, exist_ok=True)
            os.makedirs("%s/train-emb-all/" % plots_path, exist_ok=True)
            os.makedirs("%s/test-emb-all/" % plots_path, exist_ok=True)
            os.makedirs("%s/cluster-losses/" % plots_path, exist_ok=True)
            os.makedirs("%s/cluster-counts/" % plots_path, exist_ok=True)

            graph(plot_train_emb,
                  train_y[plot_sample_indexs],
                  cluster_centers=ensure_numpy(the_loss.centroids)[cluster_indexs],
                  cluster_classes=the_loss.cluster_classes[cluster_indexs],
                  savepath="%s/train-emb/i%06d%s" % (plots_path, iteration, plots_ext))

            graph(plot_test_emb,
                  test_y[plot_test_sample_indexs],
                  cluster_centers=ensure_numpy(the_loss.centroids)[cluster_indexs],
                  cluster_classes=the_loss.cluster_classes[cluster_indexs],
                  savepath="%s/test-emb/i%06d%s" % (plots_path, iteration, plots_ext))

            graph(plot_train_emb,
                  # train_y[plot_sample_indexs],
                  train_y[test_train_inds],
                  cluster_centers=ensure_numpy(the_loss.centroids),
                  cluster_classes=the_loss.cluster_classes,
                  savepath="%s/train-emb-all/i%06d%s" % (plots_path, iteration, plots_ext))

            graph(plot_test_emb,
                  test_y[plot_test_sample_indexs],
                  cluster_centers=ensure_numpy(the_loss.centroids),
                  cluster_classes=the_loss.cluster_classes,
                  savepath="%s/test-emb-all/i%06d%s" % (plots_path, iteration, plots_ext))

            plot_smooth({'loss': batch_losses,
                         'train acc': train_accs[0],
                         'test acc': test_accs[0]},
                        savepath="%s/loss_simple%s" % (plots_path, plots_ext))
            plot_smooth({'loss': batch_losses,
                         'train acc': train_accs[1],
                         'test acc': test_accs[1]},
                        savepath="%s/loss_magnet%s" % (plots_path, plots_ext))
            plot_smooth({'loss': batch_losses,
                         'train acc': train_accs[2],
                         'test acc': test_accs[2]},
                        savepath="%s/loss_repmet%s" % (plots_path, plots_ext))
            # plot_smooth({'loss': batch_losses,
            #              'train acc': train_accs[3],
            #              'test acc': test_accs[3]},
            #             savepath="%s/loss_unsupervised%s" % (plots_path, plots_ext))

            plot_cluster_data(the_loss.cluster_losses,
                              the_loss.cluster_classes,
                              title="cluster losses",
                              savepath="%s/cluster-losses/i%06d%s" % (plots_path, iteration, plots_ext))

            cluster_counts = []
            for c in range(len(the_loss.cluster_assignments)):
                cluster_counts.append(len(the_loss.cluster_assignments[c]))

            plot_cluster_data(cluster_counts,
                              the_loss.cluster_classes,
                              title="cluster counts",
                              savepath="%s/cluster-counts/i%06d%s" % (plots_path, iteration, plots_ext))

        if iteration > 0 and not iteration % save_every:
            if save_path:
                if test_acc_d > best_acc:
                    print("Saving model (is best): %s/i%06d%s" % (save_path, iteration, '.pth'))
                    best_acc = test_acc_d
                else:
                    print("Saving model: %s/i%06d%s" % (save_path, iteration, '.pth'))

                state = {
                    'iteration': iteration,
                    'state_dict': net.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'acc': test_acc_d,
                    'best_acc': best_acc,
                    'the_loss': the_loss,
                    'plot_sample_indexs': plot_sample_indexs,
                    'plot_classes': plot_classes,
                    'plot_test_sample_indexs': plot_test_sample_indexs,
                    'plot_test_classes': plot_test_classes,
                    'batch_losses': batch_losses,
                    'train_accs': train_accs,
                    'test_accs': test_accs,
                }
                if optimizerb:
                    state['optimizerb'] = optimizerb.state_dict()

                torch.save(state, "%s/i%06d%s" % (save_path, iteration, '.pth'))

    # END TRAINING LOOP

    # Plot curves and graphs
    plot_smooth({'loss': batch_losses,
                 'train acc': train_accs[0],
                 'test acc': test_accs[0]},
                savepath="%s/loss_simple%s" % (plots_path, plots_ext))
    plot_smooth({'loss': batch_losses,
                 'train acc': train_accs[1],
                 'test acc': test_accs[1]},
                savepath="%s/loss_magnet%s" % (plots_path, plots_ext))
    plot_smooth({'loss': batch_losses,
                 'train acc': train_accs[2],
                 'test acc': test_accs[2]},
                savepath="%s/loss_repmet%s" % (plots_path, plots_ext))
    plot_smooth({'loss': batch_losses,
                 'train acc': train_accs[3],
                 'test acc': test_accs[3]},
                savepath="%s/loss_unsupervised%s" % (plots_path, plots_ext))

    # Calculate and graph the final embeddings
    final_reps = compute_reps(net, train_dataset, plot_sample_indexs, chunk_size=chunk_size)
    graph(final_reps, train_y[plot_sample_indexs], savepath="%s/emb-final%s" % (plots_path, plots_ext))

    # Save the final model
    if save_path and iteration:
        if test_acc_d > best_acc:
            print("Saving model (is best): %s/i%06d%s" % (save_path, iteration + 1, '.pth'))
            best_acc = test_acc_d
        else:
            print("Saving model: %s/i%06d%s" % (save_path, iteration + 1, '.pth'))

        state = {
            'iteration': iteration,
            'state_dict': net.state_dict(),
            'optimizer': optimizer.state_dict(),
            'acc': test_acc_d,
            'best_acc': best_acc,
            'the_loss': the_loss,
            'plot_sample_indexs': plot_sample_indexs,
            'plot_classes': plot_classes,
            'plot_test_sample_indexs': plot_test_sample_indexs,
            'plot_test_classes': plot_test_classes,
            'batch_losses': batch_losses,
            'train_accs': train_accs,
            'test_accs': test_accs,
        }
        if optimizerb:
            state['optimizerb'] = optimizerb.state_dict()

        torch.save(state, "%s/i%06d%s" % (save_path, iteration + 1, '.pth'))
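# Example invocation (illustrative only; the run_id, set_name and model_name values are
# placeholders and must match what this repo's load_datasets()/load_net() accept):
#
#     train(run_id='001', set_name='oxford_flowers', model_name='resnet18',
#           loss_type='repmet', m=12, d=4, k=3, alpha=1.0,
#           n_iterations=5000, calc_acc_every=10, plot_every=100, save_every=200)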
def evaluate(run_id,
             set_name,
             model_name,
             chunk_size=32,
             split='test',
             load_iteration=-1,
             load_path=configs.general.paths.models,
             plots_path=configs.general.paths.graphing,
             plots_ext='.png'):

    # Setup load path
    load_path = os.path.join(load_path, "%s" % run_id)

    # Setup plotting directory
    plots_path = os.path.join(plots_path, "%s" % run_id)
    os.makedirs(plots_path, exist_ok=True)

    net, input_size = load_net(model_name)

    # Load set and get train and test labels from datasets
    train_dataset, test_dataset = load_datasets(set_name, input_size=input_size)
    if split == 'train':
        dataset = train_dataset
    else:
        dataset = test_dataset
    y = get_labels(dataset)

    # Use the GPU
    net = torch.nn.DataParallel(net, device_ids=range(torch.cuda.device_count()))
    net.cuda()
    cudnn.benchmark = True

    # Load the particular iteration we want
    if load_iteration < 0:
        l = os.listdir(load_path)
        l.sort(reverse=True)
        state = torch.load("%s/%s" % (load_path, l[1]))  # ignore log.txt
        print("Loading model: %s/%s" % (load_path, l[1]))
    else:
        if os.path.exists("%s/i%06d%s" % (load_path, load_iteration, '.pth')):
            state = torch.load("%s/i%06d%s" % (load_path, load_iteration, '.pth'))
            print("%s/i%06d%s" % (load_path, load_iteration, '.pth'))
        else:
            print("%s/i%06d%s doesn't exist... awkies. :/" % (load_path, load_iteration, '.pth'))
            return

    # Load the net state
    net.load_state_dict(state['state_dict'])

    # Load the loss and cluster centres
    the_loss = state['the_loss']

    # Compute the embeddings of the dataset
    x = compute_reps(net, dataset, list(range(len(y))), chunk_size=chunk_size)

    # Compute the accuracies
    test_acc = the_loss.calc_accuracy(x, y, method='simple')
    test_acc_b = the_loss.calc_accuracy(x, y, method='magnet')
    test_acc_c = the_loss.calc_accuracy(x, y, method='repmet')
    test_acc_d = the_loss.calc_accuracy(x, y, method='unsupervised')

    print("simple: %0.3f -- magnet: %0.3f -- repmet: %0.3f -- unsupervised: %0.3f"
          % (test_acc, test_acc_b, test_acc_c, test_acc_d))

    # And hey, why not graph them all!
    graph(x, y,
          cluster_centers=ensure_numpy(the_loss.centroids),
          cluster_classes=the_loss.cluster_classes,
          savepath="%s/test-%s%s" % (plots_path, split, plots_ext))
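# Example invocation (illustrative; loads the latest checkpoint saved by train() for the run,
# placeholder run_id/set_name/model_name values):
#
#     evaluate(run_id='001', set_name='oxford_flowers', model_name='resnet18',
#              split='test', load_iteration=-1)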
def main(args):
    # Load train/test data
    datadir = os.path.join(args.volumedir, args.datadir)
    # train = imdb_data_load(datadir)
    train, test = load_datasets(datadir)
    # train, test = load_context_target_pairs(datadir, context_len=args.conlength)
    # train = sorted(train, key=lambda a: len(a), reverse=True)
    # train = train[:min(len(train), args.datacap)]
    # for msg in train:
    #     if "roster" in msg:
    #         print(msg)
    # return

    # Dynamically load the modelBuilder class
    moduleName, klassName = args.modelbuilder.split(".")
    mod = __import__('models.%s' % moduleName, fromlist=[klassName])
    klass = getattr(mod, klassName)
    modelBuilder = klass(args)

    timestamp = int(time.time())

    logdir = os.path.join(args.volumedir,
                          datetime.datetime.today().strftime('%Y%m%d'),
                          args.logdir)
    if not os.path.isdir(logdir):
        os.makedirs(logdir)
    hdlr = logging.FileHandler(os.path.join(logdir, "training_output_%d.log" % timestamp))
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    hdlr.setFormatter(formatter)
    logger.addHandler(hdlr)
    logger.setLevel(logging.INFO)

    checkpointdir = os.path.join(args.volumedir,
                                 datetime.datetime.today().strftime('%Y%m%d'),
                                 args.checkpointdir)
    if not os.path.isdir(checkpointdir):
        os.makedirs(checkpointdir)
    checkpointpath = configure_checkpointing(args, timestamp)
    checkpoint_callback = ModelCheckpoint(filepath=checkpointpath, save_weights_only=False)

    # Create or load an existing model
    init_epoch = 0
    if args.textlineds:
        X, Y, vocab, tokens = SlackTextLineDataset(args, train).get_dataset()
        reverse_token_map = {t: i for i, t in enumerate(vocab)}
    else:
        tokens, vocab, reverse_token_map = modelBuilder.tokenize(train, freq_threshold=args.freqthreshold)
        # text_ds = text_ds.shuffle(buffer_size=1024).batch(args.minibatchsize)
        # print(text_ds.cardinality().numpy())

    if args.loadmodel and os.path.exists(args.loadmodel):
        modelpath = args.loadmodel
        timestamp = int(modelpath.split(".")[1])
        init_epoch = int(modelpath.split(".")[2])
        loaddir = "/".join(modelpath.split("/")[:-1])
        model = load_model(modelpath, custom_objects={"EinsumOp": EinsumOp})
        vocab = load_vocab(loaddir, timestamp)
        # tokens = load_tokens(loaddir, timestamp)
        reverse_token_map = {t: i for i, t in enumerate(vocab)}
    else:
        model = modelBuilder.create_model(vocab)
        save_vocab(vocab, checkpointdir, timestamp)
        if args.savetokens:
            save_tokens(tokens, checkpointdir, timestamp)

    plot_model(model, to_file='model_plot_2.png', show_shapes=True, show_layer_names=True)

    optimizer_map = {"adam": Adam, "rmsprop": RMSprop, "sgd": SGD}
    optimizer = optimizer_map[args.optimizer] if args.optimizer in optimizer_map else RMSprop
    lr_decay = ExponentialDecay(initial_learning_rate=args.learningrate,
                                decay_rate=args.decayrate,
                                decay_steps=args.decaysteps)
    custom_lr = CustomSchedule(args.hiddensize)
    opt = optimizer(learning_rate=lr_decay, clipvalue=3)

    # model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=["accuracy"])
    # attn_4_output = model.get_layer("attention_values_4").output
    # dense_v_out = model.get_layer("dense_v_4").output
    # einsum_com_output = model.get_layer("einsum_com_4").output
    # inpt = model.get_layer("input")
    # attn_factor_model = keras.Model(inputs=inpt.input, outputs=attn_4_output)
    # einsum_com_model = keras.Model(inputs=inpt.input, outputs=einsum_com_output)
    # dense_v_model = keras.Model(inputs=inpt.input, outputs=dense_v_out)

    model.compile(loss=keras.losses.SparseCategoricalCrossentropy(name="loss"),
                  run_eagerly=True,
                  optimizer=opt,
                  metrics=[
                      tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy"),
                      tf.keras.metrics.SparseTopKCategoricalAccuracy(k=3, name="top_3_accuracy"),
                      tf.keras.metrics.SparseTopKCategoricalAccuracy(k=5, name="top_5_accuracy"),
                      last_word_prediction_accuracy(args.minibatchsize, args.seqlength)
                  ])
    # last_word_prediction_topk_accuracy(args.minibatchsize, args.seqlength, 5)])

    model.summary(print_fn=logger.info)

    checkpointnames = args.checkpointnames % timestamp
    sample_func = lambda: modelBuilder.sample(model, tokens, vocab, reverse_token_map)
    callbacks = get_callbacks(args.volumedir, checkpointdir, checkpointnames, timestamp, sample_func)
    sample_callback = LambdaCallback(on_epoch_end=lambda epoch, logs: sample_func())
    logger_callback = LambdaCallback(
        on_epoch_end=lambda epoch, logs: logger.info("Epoch %d: %s" % (epoch, str(logs))))

    if not args.textlineds:
        trainseqs = modelBuilder.get_input_sequences(tokens, reverse_token_map)
        # trainseqs, valseqs = validation_split(seqs, val_split=args.valsplit)

        if args.modelbuilder == "keras_word_lm.WordLanguageModelBuilder":
            trainvectors = SequenceVectors(args, trainseqs, vocab)
            history = model.fit(trainvectors,
                                epochs=args.numepochs,
                                initial_epoch=init_epoch,
                                callbacks=[sample_callback, logger_callback, checkpoint_callback])
            logger.info(history.history)
            plot_history(history.history, args.learningrate, logdir, timestamp)
            return

        X, Y, sample_weights = modelBuilder.build_input_vectors(trainseqs, vocab, reverse_token_map)
        # ds = modelBuilder.build_input_vectors(trainseqs, vocab, reverse_token_map)
        # model.fit(X, Y,
        # print(ds)
        # start_prompt = "this movie is"
        # start_tokens = [reverse_token_map[t] for t in start_prompt.split()]
        # num_tokens_generated = 40
        # text_gen_callback = TextGenerator(num_tokens_generated, args.seqlength, start_tokens, vocab)

    history = model.fit(X, Y,
                        epochs=args.numepochs,
                        initial_epoch=init_epoch,
                        batch_size=args.minibatchsize,
                        validation_split=0.1,
                        shuffle=True,
                        callbacks=[sample_callback, logger_callback, checkpoint_callback])
    logger.info(history.history)
    plot_history(history.history, args.learningrate, logdir, timestamp)
    return

    # Unreachable manual training loop, kept for reference
    allmetrics = {}
    for epoch in range(init_epoch, args.numepochs):
        batches = rand_mini_batches(trainseqs, args.minibatchsize)
        for i, batch in enumerate(batches):
            X, Y, sample_weights = modelBuilder.build_input_vectors(batch, vocab, reverse_token_map)
            metrics = model.train_on_batch(X, Y,
                                           sample_weight=sample_weights,
                                           reset_metrics=i == 0,
                                           return_dict=True)
            if i % 100 == 0:
                valmetrics = evaluate_mini_batches(model, modelBuilder, vocab, reverse_token_map,
                                                   valseqs, args.minibatchsize)
                metrics.update(valmetrics)
                for key in metrics.keys():
                    if key in allmetrics.keys():
                        allmetrics[key] += [metrics[key]]
                    else:
                        allmetrics[key] = [metrics[key]]
                print("Batch %d of %d in epoch %d: %s" % (i, len(batches), epoch, str(metrics)))

        logger.info("Epoch %d: %s" % (epoch, str(metrics)))
        # logger.info("Validation metrics %s" % str(valmetrics))
        if args.runsamples:
            sample_output = sample_func()
            logger.info("\n" + sample_output)
        model.save(os.path.join(checkpointdir, checkpointnames).format(epoch=epoch))

    plot_history(allmetrics, args.learningrate, logdir, timestamp)
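# Illustrative note (not in the original script): main() expects an argparse-style `args`
# object; the attributes it reads include volumedir, datadir, modelbuilder, logdir,
# checkpointdir, checkpointnames, textlineds, freqthreshold, loadmodel, savetokens,
# optimizer, learningrate, decayrate, decaysteps, hiddensize, minibatchsize, seqlength
# and numepochs. A minimal sketch of wiring it up (flag names are placeholders):
#
#     parser = argparse.ArgumentParser()
#     parser.add_argument("--volumedir", default="/data")
#     parser.add_argument("--modelbuilder", default="keras_word_lm.WordLanguageModelBuilder")
#     ...  # one argument per attribute listed above
#     main(parser.parse_args())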
def main(opt, device):
    best_acc1 = 0
    if not opt.nlog and not opt.test:
        sys.stdout = Logger(Path(opt.save_dir) / 'log_.txt')
    if opt.global_rank in [-1, 0]:
        print_argument_options(opt)

    # Configure
    cuda = device.type != 'cpu'
    init_torch_seeds()

    dataset = load_datasets(opt.data, opt.batch_size, cuda, opt.workers, opt.global_rank)
    trainloader, testloader = dataset.trainloader, dataset.testloader
    opt.num_classes = dataset.num_classes
    if opt.global_rank in [-1, 0]:
        print("Created dataset: {}".format(opt.data))

    model = build_models(opt.model, opt.num_classes, opt.input_size, opt.model_size).to(device)
    if cuda and opt.global_rank == -1 and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)
    if cuda and opt.global_rank != -1:
        model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank)
    if opt.global_rank in [-1, 0]:
        print(model)
        print("Created model: {}".format(opt.model))

    criterion = nn.CrossEntropyLoss()
    # criterion = SmoothCrossEntropyLoss(label_smoothing=0.1)
    optimizer = torch.optim.SGD(model.parameters(), lr=opt.lr, weight_decay=5e-04, momentum=0.9)

    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            opt.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            if opt.global_rank in [-1, 0]:
                print("=> loaded checkpoint '{}' (epoch {})".format(opt.resume, checkpoint['epoch']))
        else:
            if opt.global_rank in [-1, 0]:
                print("=> no checkpoint found at '{}'".format(opt.resume))

    opt.scaler = torch.cuda.amp.GradScaler(enabled=True)

    if opt.global_rank in [-1, 0]:
        start_time = time.time()

    for epoch in range(opt.start_epoch, opt.max_epoch):
        if opt.global_rank != -1:
            trainloader.sampler.set_epoch(epoch)
        if opt.global_rank in [-1, 0]:
            print("==> Epoch {}/{}".format(epoch + 1, opt.max_epoch))

        __training(opt, model, criterion, optimizer, trainloader, epoch, device, opt.global_rank)

        if (opt.eval_freq > 0 and (epoch + 1) % opt.eval_freq == 0) or (epoch + 1) == opt.max_epoch:
            # if cuda and opt.global_rank != -1:
            #     model.module.inference_mode()
            # else:
            #     model.inference_mode()
            acc1 = __testing(opt, model, testloader, epoch, device, opt.global_rank)
            # if cuda and opt.global_rank != -1:
            #     model.module.training_mode()
            # else:
            #     model.training_mode()
            acc1 = __testing(opt, model, testloader, epoch, device, opt.global_rank)

            # Remember best acc@1 and save a checkpoint
            is_best = acc1 > best_acc1
            best_acc1 = max(acc1, best_acc1)

            if opt.global_rank in [-1, 0]:
                save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'arch': opt.model,
                        'state_dict': model.state_dict(),
                        'best_acc1': best_acc1,
                        'optimizer': optimizer.state_dict(),
                    }, is_best, save_dir=opt.save_dir)

    if opt.global_rank in [-1, 0]:
        elapsed = round(time.time() - start_time)
        elapsed = str(datetime.timedelta(seconds=elapsed))
        print("Finished. Total elapsed time (h:m:s): {}".format(elapsed))
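# Note (not in the original script): the DDP branch assumes torch.distributed has already
# been initialised by the caller and that opt.global_rank / opt.local_rank are set per
# process, e.g. when the entry point is launched with something like
#     python -m torch.distributed.launch --nproc_per_node=4 train.py ...
# In single-process runs opt.global_rank is expected to be -1, in which case the
# DataParallel path is taken instead.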
def main(args):
    volumedir = args.volumedir
    datadir = os.path.join(volumedir, args.datadir)
    checkpointdir = os.path.join(volumedir, args.checkpointdir)
    checkpointnames = args.checkpointnames
    mini_batch_size = args.minibatchsize
    learning_rate = args.learningrate
    dropout_rate = args.dropoutrate
    reg_factor = args.regfactor
    n_a = args.hiddensize
    num_epochs = args.numepochs
    loadmodel = args.loadmodel

    timestamp = int(time.time())
    checkpointnames = checkpointnames % timestamp

    train, test = load_datasets(datadir)
    train = train[:min(len(train), args.datacap)]
    m = len(train)

    chars = set()
    step = args.step
    maxlen = args.seqlength

    # for msg in train:
    #     chars = chars.union(set(msg))
    # chars = sorted(list(chars))
    # chars = ['\t', '\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
    #          '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@',
    #          '[', '\\', ']', '^', '_', '`',
    #          'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
    #          't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '*']
    chars = [
        '\n', ' ', '!', '"', ',', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '?', '@',
        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
        't', 'u', 'v', 'w', 'x', 'y', 'z', '~', '*'
    ]
    char_to_ix = {c: i for i, c in enumerate(chars)}

    if loadmodel and os.path.exists(loadmodel):
        timestamp = int(loadmodel.split(".")[1])
        epoch_number = int(loadmodel.split(".")[2])
        model = load_checkpoint_model(loadmodel)
    else:
        model = create_seq2seq_model(chars, n_a, maxlen, learning_rate,
                                     dropout_rate=dropout_rate, reg_factor=reg_factor)
        epoch_number = 0

    hdlr = logging.FileHandler(os.path.join(volumedir, "training_output_%d.log" % timestamp))
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    hdlr.setFormatter(formatter)
    logger.addHandler(hdlr)
    logger.setLevel(logging.INFO)

    metrics = []
    X, Y = format_x_y_no_seed(maxlen, chars, train, step, char_to_ix)
    callbacks = get_callbacks(volumedir, checkpointdir, checkpointnames, chars, char_to_ix, train, model, timestamp)

    model.fit(X, Y,
              batch_size=mini_batch_size,
              epochs=num_epochs,
              initial_epoch=epoch_number,
              validation_split=0.2,
              shuffle=True,
              callbacks=callbacks)
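# Illustrative note (not in the original script): this char-level trainer reads its settings
# from an argparse-style `args` with volumedir, datadir, checkpointdir, checkpointnames
# (a pattern containing %d for the timestamp), minibatchsize, learningrate, dropoutrate,
# regfactor, hiddensize, numepochs, loadmodel, datacap, step and seqlength.
# A placeholder invocation might look like:
#
#     args = argparse.Namespace(volumedir="/data", datadir="corpus", checkpointdir="ckpts",
#                               checkpointnames="model.%d.{epoch:02d}.h5", minibatchsize=128,
#                               learningrate=0.001, dropoutrate=0.2, regfactor=0.0001,
#                               hiddensize=128, numepochs=50, loadmodel=None,
#                               datacap=100000, step=3, seqlength=40)
#     main(args)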
def train_model(batch_size, n_epochs, learning_rate, saved_epoch,
                run_id="def",
                set_name="stanford_dogs",
                save_every=1000,
                save_path=configs.models,
                plot_every=500,
                plot_path=configs.plots):

    # Setup save directories
    if save_path:
        save_path = os.path.join(save_path, "run_{}".format(run_id))
        os.makedirs(save_path, exist_ok=True)
    if plot_path:
        plot_path = os.path.join(plot_path, "run_{}".format(run_id))
        os.makedirs(plot_path, exist_ok=True)

    # Load network and use GPU
    net = models.Net2().cuda()
    cudnn.benchmark = True

    # Load dataset
    train_data, test_data, classes = load_datasets(set_name)
    # train_y, test_y = utils.get_labels(train_data), utils.get_labels(test_data)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)

    # Obtain one batch of training images
    dataiter = iter(train_loader)
    images, labels = next(dataiter)
    images = np.swapaxes(np.swapaxes(images.numpy(), 1, 2), 2, 3)

    # Plot the images in the batch, along with the corresponding labels
    fig = plt.figure(figsize=(batch_size / 4 + 5, batch_size / 4 + 5))
    for idx in np.arange(batch_size):
        ax = fig.add_subplot(batch_size // 8, 8, idx + 1, xticks=[], yticks=[])  # integer grid size
        ax.imshow(images[idx])
        ax.set_title(classes[labels[idx]], {'fontsize': batch_size / 5}, pad=0.4)
    plt.tight_layout(pad=1, w_pad=0, h_pad=0)
    if plot_path:
        plt.savefig(os.path.join(plot_path, "Initial_Visualization"))
    else:
        plt.show()
    plt.clf()

    # nn.CrossEntropyLoss() combines softmax and nn.NLLLoss() in one single class;
    # here NLLLoss is used directly (assuming the network's final layer applies log-softmax)
    criterion = nn.NLLLoss()
    # Stochastic gradient descent with a small learning rate
    optimizer = optim.SGD(net.parameters(), lr=learning_rate)

    # ToDo: Add to utils
    # Calculate accuracy before training
    correct = 0
    total = 0

    # Iterate through the test dataset
    for images, labels in test_loader:
        images, labels = images.cuda(), labels.cuda()

        # Forward pass to get outputs; the outputs are a series of class scores
        outputs = net(images)

        # Get the predicted class from the maximum value in the output-list of class scores
        _, predicted = torch.max(outputs.data, 1)

        # Count up the total number of labels for which the predicted and true labels are equal
        total += labels.size(0)
        correct += (predicted == labels).sum()

    # Calculate the accuracy; use .item() to convert `correct` from a Tensor into a scalar
    accuracy = 100.0 * correct.item() / total
    # print('Accuracy before training: ', accuracy)

    def train(n_epochs):
        net.train()
        loss_over_time = []  # to track the loss as the network trains

        for epoch in range(n_epochs):  # loop over the dataset multiple times
            output_epoch = epoch + saved_epoch
            running_loss = 0.0

            for batch_i, data in enumerate(train_loader):
                # Get the input images and their corresponding labels
                inputs, labels = data
                inputs, labels = inputs.cuda(), labels.cuda()

                # Zero the parameter (weight) gradients
                optimizer.zero_grad()

                # Forward pass to get outputs
                outputs = net(inputs)

                # Calculate the loss
                loss = criterion(outputs, labels)

                # Backward pass to calculate the parameter gradients
                loss.backward()

                # Update the parameters
                optimizer.step()

                # Accumulate loss statistics; .item() converts the loss into a scalar
                running_loss += loss.item()
                if batch_i % 45 == 44:  # print every 45 batches
                    avg_loss = running_loss / 45
                    # Record and print the average loss over the last 45 batches
                    loss_over_time.append(avg_loss)
                    print('Epoch: {}, Batch: {}, Avg. Loss: {}'.format(output_epoch + 1, batch_i + 1, avg_loss))
                    running_loss = 0.0

            if output_epoch % 100 == 99:  # save every 100 epochs
                torch.save(net.state_dict(), 'saved_models/Net2_{}.pt'.format(output_epoch + 1))

        print('Finished Training')
        return loss_over_time

    if saved_epoch:
        net.load_state_dict(torch.load('saved_models/Net2_{}.pt'.format(saved_epoch)))

    # Call train and record the loss over time
    training_loss = train(n_epochs)

    # Visualize the loss as the network trained
    fig = plt.figure()
    plt.plot(45 * np.arange(len(training_loss)), training_loss)
    plt.rc('xtick', labelsize=12)
    plt.rc('ytick', labelsize=12)
    plt.xlabel('Number of Batches', fontsize=12)
    plt.ylabel('loss', fontsize=12)
    plt.ylim(0, 5.5)  # consistent scale
    plt.tight_layout()
    if plot_path:
        plt.savefig(os.path.join(plot_path, "Loss_Over_Time"))
        print("saved")
    else:
        plt.show()
    plt.clf()

    # Initialize tensor and lists to monitor test loss and accuracy
    test_loss = torch.zeros(1).cuda()
    class_correct = list(0. for i in range(len(classes)))
    class_total = list(0. for i in range(len(classes)))

    # Set the module to evaluation mode; this turns off layers that are only
    # useful for training, like dropout and batch norm
    net.eval()

    for batch_i, data in enumerate(test_loader):
        # Get the input images and their corresponding labels
        inputs, labels = data
        inputs, labels = inputs.cuda(), labels.cuda()

        # Forward pass to get outputs
        outputs = net(inputs)

        # Calculate the loss
        loss = criterion(outputs, labels)

        # Update the running average test loss
        test_loss = test_loss + ((torch.ones(1).cuda() / (batch_i + 1)) * (loss.data - test_loss))

        # Get the predicted class from the maximum value in the output-list of class scores
        _, predicted = torch.max(outputs.data, 1)

        # Compare predictions to true labels; `correct` marks the correctly classified images in the batch
        correct = np.squeeze(predicted.eq(labels.data.view_as(predicted)))

        # Calculate test accuracy for *each* object class
        # (correct[i].item() gives the scalar count of correct items for a class)
        for l, c in zip(labels.data, correct):
            class_correct[l] += c.item()
            class_total[l] += 1

    print('Test Loss: {:.6f}\n'.format(test_loss.cpu().numpy()[0]))

    for i in range(len(classes)):
        if class_total[i] > 0:
            print('Test Accuracy of %30s: %2d%% (%2d/%2d)' %
                  (classes[i], 100 * class_correct[i] / class_total[i],
                   np.sum(class_correct[i]), np.sum(class_total[i])))
        else:
            print('Test Accuracy of %5s: N/A (no training examples)' % (classes[i]))

    print('\nTest Accuracy (Overall): %2d%% (%2d/%2d)' %
          (100. * np.sum(class_correct) / np.sum(class_total),
           np.sum(class_correct), np.sum(class_total)))

    # Visualize sample results (runs until a batch contains a misclassification)
    # Plot the images in the batch, along with predicted and true labels
    fig = plt.figure(figsize=(batch_size / 4 + 5, batch_size / 4 + 5))
    misclassification_found = False
    while not misclassification_found:
        fig.clf()

        # Obtain one batch of test images
        dataiter = iter(test_loader)
        images, labels = next(dataiter)
        images, labels = images.cuda(), labels.cuda()

        # Get predictions
        preds = np.squeeze(net(images).data.max(1, keepdim=True)[1].cpu().numpy())
        images = np.swapaxes(np.swapaxes(images.cpu().numpy(), 1, 2), 2, 3)

        for idx in np.arange(batch_size):
            ax = fig.add_subplot(batch_size // 8, 8, idx + 1, xticks=[], yticks=[])
            ax.imshow(images[idx])
            if preds[idx] == labels[idx]:
                ax.set_title("{}".format(classes[preds[idx]], classes[labels[idx]]), color="green")
            else:
                ax.set_title("({})\n{}".format(classes[labels[idx]], classes[preds[idx]]), color="red", pad=.4)
                misclassification_found = True

    if plot_path:
        plt.savefig(os.path.join(plot_path, "Results Visualization"))
    else:
        plt.show()
    plt.clf()
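# Example invocation (illustrative placeholder values; set_name must be one the repo's
# load_datasets() knows about, and batch_size should be a multiple of 8 for the plot grid):
#
#     train_model(batch_size=32, n_epochs=100, learning_rate=0.001, saved_epoch=0,
#                 run_id="dogs0", set_name="stanford_dogs")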