def create_post(node):
    operation = request.GET.get("operation")
    uci_model = client.get_uci_config()
    parent = uci_model.find_child(node)
    if isinstance(parent, uci_raw.Section):
        if operation == "add-list":
            form = UciRawForm(uci_raw.List, editable_key=True)
            if form.validates(request.POST):
                new_element = form.to_model()
        elif operation == "add-option":
            form = UciRawForm(uci_raw.Option, editable_key=True)
            if form.validates(request.POST):
                new_element = form.to_model()
        else:
            raise ValueError("Requested operation not allowed for Section node.")
    elif isinstance(parent, uci_raw.Config):
        form = UciRawForm(uci_raw.Section, editable_key=True)(request.POST)
        if form.validates(request.POST):
            new_element = form.to_model()
    elif isinstance(parent, uci_raw.List):
        form = UciRawForm(uci_raw.Value, editable_key=True)(request.POST)
        if form.validates(request.POST):
            new_element = form.to_model()
    else:
        raise ValueError("New node cannot be created here.")
    if not form.valid:
        return dict(node_path=node, form=form)
    new_element.operation = "create"
    parent.add(new_element)
    print_model(new_element)
    edit_uci_config(new_element)
    bottle.redirect(reverse("uci_index"))
def mnist_utilizando_cnn_simples():
    (X_train, y_train), (X_test, y_test) = load_mnist_dataset('mnist.npz')
    # reshape to the format [samples][channels][width][height]
    X_train = X_train.reshape(X_train.shape[0], 1, 28, 28).astype('float32')
    X_test = X_test.reshape(X_test.shape[0], 1, 28, 28).astype('float32')
    # normalize pixel values from 0-255 to 0-1
    X_train = X_train / 255
    X_test = X_test / 255
    # one-hot encode the integer labels: the value becomes the index that is set,
    # e.g. 5 becomes the list [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
    y_train = np_utils.to_categorical(y_train)
    y_test = np_utils.to_categorical(y_test)
    num_classes = y_test.shape[1]
    # define the network topology and compile it
    model = create_compile_model_cnn_simples(num_classes)
    utils.print_model(model, "model_simples.png")
    # train the network
    history = model.fit(X_train, y_train, validation_data=(X_test, y_test),
                        epochs=10, batch_size=100, verbose=2)
    # print_history_accuracy(history)
    utils.print_history_loss(history)
    # final evaluation on the test cases
    scores = model.evaluate(X_test, y_test, verbose=0)
    print('Scores: ', scores)
    print("Erro modelo MLP: %.2f%%" % (100 - scores[1] * 100))
def ciclo_completo():
    (input_attributes, output_attributes) = read_cvs_dataset("pima-indians-diabetes.csv", 8)
    model = create_model()
    utils.print_model(model, "model_MLP.png")
    compile_model(model)
    history = fit_model(model, input_attributes, output_attributes)
    utils.print_history_loss(history)
    model_evaluate(model, input_attributes, output_attributes)
    model_print_predictions(model, input_attributes, output_attributes)
def train(model, v_emb, q_emb, groundtruth, num_epochs, output, opt=None, s_epoch=0):
    lr_default = 1e-3 * 0.5
    lr_decay_step = 2
    lr_decay_rate = .25
    lr_decay_epochs = range(10, 20, lr_decay_step)
    gradual_warmup_steps = [0.5 * lr_default, 1.0 * lr_default,
                            1.5 * lr_default, 2.0 * lr_default]
    saving_epoch = 3
    grad_clip = .25

    utils.create_dir(output)
    optim = torch.optim.Adamax(filter(lambda p: p.requires_grad, model.parameters()), lr=lr_default) \
        if opt is None else opt
    logger = utils.Logger(os.path.join(output, 'log.txt'))
    best_eval_score = 0

    utils.print_model(model, logger)
    logger.write('optim: adamax lr=%.4f, decay_step=%d, decay_rate=%.2f, grad_clip=%.2f' %
                 (lr_default, lr_decay_step, lr_decay_rate, grad_clip))

    v_emb = v_emb.cuda()
    q_emb = q_emb.cuda()
    for epoch in range(s_epoch, num_epochs):
        total_loss = 0
        train_score = 0
        total_norm = 0
        count_norm = 0
        t = time.time()
        N = 0
        if epoch < len(gradual_warmup_steps):
            optim.param_groups[0]['lr'] = gradual_warmup_steps[epoch]
            logger.write('gradual warmup lr: %.4f' % optim.param_groups[0]['lr'])
        elif epoch in lr_decay_epochs:
            optim.param_groups[0]['lr'] *= lr_decay_rate
            logger.write('decreased lr: %.4f' % optim.param_groups[0]['lr'])
        else:
            logger.write('lr: %.4f' % optim.param_groups[0]['lr'])

        gw = model(v_emb, q_emb)
        loss = gw
        print(loss)
        loss.backward()
        optim.step()
        optim.zero_grad()
def ciclo_ler_dataset_treinar_gravar():
    (input_attributes, output_attributes) = read_cvs_dataset("pima-indians-diabetes.csv", 8)
    model = create_model()
    utils.print_model(model, "model2.png")
    compile_model(model)
    history = fit_model(model, input_attributes, output_attributes)
    utils.print_history_accuracy(history)
    utils.print_history_loss(history)
    model_evaluate(model, input_attributes, output_attributes)
    utils.save_model_json(model, "model.json")
    utils.save_weights_hdf5(model, "model.h5")
    return (input_attributes, output_attributes)
def train_foil(model, train_loader, eval_loader, num_epochs, output, lr):
    utils.create_dir(output)
    optim = torch.optim.Adamax(filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
    logger = utils.Logger(os.path.join(output, 'log.txt'))
    best_eval_accuracy = 0
    utils.print_model(model, logger)
    logger.write('optim: adam lr=%.4f' % lr)

    for epoch in range(num_epochs):
        print("Epoch {}".format(epoch))
        total_loss = 0
        train_score = 0
        t = time.time()
        N = len(train_loader.dataset)
        bar = progressbar.ProgressBar(max_value=N)
        idx = 0
        for i, (v, b, q, a) in enumerate(train_loader):
            model.train(True)
            bar.update(idx)
            batch_size = v.size(0)
            v = Variable(v).cuda()
            b = Variable(b).cuda()
            q = Variable(q).cuda()
            a = Variable(a).cuda()
            idx += batch_size

            pred, att = model(v, b, q, a)
            loss = instance_bce_with_logits(pred, a)
            optim.zero_grad()
            loss.backward()
            optim.step()

            batch_score = compute_accuracy_with_logits(pred, a.data)
            total_loss += loss.data[0] * v.size(0)
            train_score += batch_score
            bar.update(idx)

        total_loss /= N
        train_score = 100 * train_score / N

        if eval_loader is not None:
            model.train(False)
            eval_score = evaluate_foil(model, eval_loader)
            model.train(True)

        logger.write('epoch %d, time: %.2f' % (epoch, time.time() - t))
        logger.write('\ttrain_loss: %.2f, score: %.2f' % (total_loss, train_score))
        if eval_loader is not None:
            logger.write('\teval score: %.2f' % (100 * eval_score))

        if eval_loader is not None and eval_score > best_eval_accuracy:
            model_path = os.path.join(output, 'model_epoch%d.pth' % epoch)
            utils.save_model(model_path, model, epoch, optim)
            if eval_loader is not None:
                best_eval_accuracy = eval_score
def main_quantified_TransE():
    # Define some hyper-parameters for training
    emb_dim = 100
    lr = 0.0004
    margin = 0.5
    n_epochs = 1000
    batch_size = 2097152

    # Load dataset
    data_path = "/tmp/pycharm_project_583/data/uncmtrd/agg6_202005_ALL_tv.csv"
    kg_train, kg_val, kg_test = load_custom_qr(data_path=data_path)

    model = TransEQuantifiedRelations(
        emb_dim,
        kg_train.n_ent,
        kg_train.n_rel,
        dissimilarity_type="L2",
    )
    # check we only have two embedding layers - one for entities, the other for relations
    print_model(model)

    dataset_name = data_path.split('/')[-1].replace('.csv', '')
    curr_time = datetime.now().strftime('%Y%m%d%H%M%S')
    model_prefix = os.path.join(
        './pretrained',
        f'{dataset_name}_emb{emb_dim}_lr{lr}_mgn{margin}_epch{n_epochs}_bsize{batch_size}_t{curr_time}')

    criterion = MarginLoss(margin)
    optimizer = Adam(model.parameters(), lr=lr, weight_decay=1e-5)
    trainer = Trainer(
        model,
        criterion,
        kg_train,
        n_epochs,
        batch_size,
        optimizer=optimizer,
        sampling_type="bern",
        use_cuda=None,
    )
    trainer.run(kg_test=kg_test, model_prefix=model_prefix)
def LSTM_sales_data(normalizer=None):
    df, scaler = get_data(normalizer=normalizer)
    print("Dataset: ", df.shape)
    janela = 6  # size of the sliding window (quarterly, monthly, half-yearly)
    X_train, y_train, X_test, y_test = split_data(df, janela)
    print("X_train", X_train.shape)
    print("y_train", y_train.shape)
    print("X_test", X_test.shape)
    print("y_test", y_test.shape)
    model = build_model(janela)
    model.fit(X_train, y_train, batch_size=10, epochs=300,
              validation_split=0.1, verbose=1)  # validation uses 0.1 of the 0.66 reserved for training
    utils.print_model(model, "lstm_model.png")
    trainScore = model.evaluate(X_train, y_train, verbose=0)
    print('\n Train Score: %.2f MSE (%.2f RMSE)' % (trainScore[0], math.sqrt(trainScore[0])))
    testScore = model.evaluate(X_test, y_test, verbose=0)
    print(' Test Score: %.2f MSE (%.2f RMSE)' % (testScore[0], math.sqrt(testScore[0])))
    print('\n****************** UNSCALED*******************')
    # Unscale results to get real-valued predictions and errors
    trainScore = trainScore[0].reshape(-1, 1).astype('float32')
    unscaled_Train = scaler.inverse_transform(trainScore)
    print('\n Unscaled Train Score: %.2f MSE (%.2f RMSE)' % (unscaled_Train, math.sqrt(unscaled_Train)))
    testScore = testScore[0].reshape(-1, 1).astype('float32')
    unscaled_Test = scaler.inverse_transform(testScore)
    print(' Unscaled Test Score: %.2f MSE (%.2f RMSE) \n' % (unscaled_Test, math.sqrt(unscaled_Test)))
    p = model.predict(X_test)
    predic = np.squeeze(np.asarray(p))  # turn a one-column matrix with n rows into an np array of n elements
    print_series_prediction(y_test, predic)
    print('')
    print_series_prediction(y_test, predic, normalizer=scaler)
def mnist_utilizando_mlp():
    (X_train, y_train), (X_test, y_test) = load_mnist_dataset('mnist.npz')
    # flatten the 28*28 image matrix into a vector of 784 attributes per image (because it is a multilayer perceptron)
    num_pixels = X_train.shape[1] * X_train.shape[2]
    X_train = X_train.reshape(X_train.shape[0], num_pixels).astype('float32')
    X_test = X_test.reshape(X_test.shape[0], num_pixels).astype('float32')
    # normalize pixel values from 0-255 to 0-1
    X_train = X_train / 255
    X_test = X_test / 255
    # one-hot encode the integer labels: the value becomes the index that is set,
    # e.g. 5 becomes the list [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
    y_train = np_utils.to_categorical(y_train)
    y_test = np_utils.to_categorical(y_test)
    num_classes = y_test.shape[1]
    # define the network topology and compile it
    model = create_compile_model_mlp(num_pixels, num_classes)
    utils.print_model(model, "model.png")
    # train the network
    history = model.fit(X_train, y_train, validation_data=(X_test, y_test),
                        epochs=10, batch_size=50, verbose=2)
    # utils.print_history_accuracy(history)
    utils.print_history_loss(history)
    # final evaluation on the test cases
    scores = model.evaluate(X_test, y_test, verbose=0)
    print('Scores: ', scores)
    print("Erro modelo MLP: %.2f%%" % (100 - scores[1] * 100))
def debug(node):
    uci_model = client.get_uci_config()
    node_model = uci_model.find_child(node)
    return "<pre>%s</pre>" % websafe(print_model(node_model))
def train(model, train_loader, eval_loader, num_epochs, output, opt=None, s_epoch=0, logger=None, save_one_ckpt=True): lr_default = 1e-3 if eval_loader is not None else 7e-4 lr_decay_step = 2 lr_decay_rate = .25 lr_decay_epochs = range( 10, 20, lr_decay_step) if eval_loader is not None else range( 10, 20, lr_decay_step) gradual_warmup_steps = [ 0.5 * lr_default, 1.0 * lr_default, 1.5 * lr_default, 2.0 * lr_default ] saving_epoch = 3 grad_clip = .25 dset = train_loader.dataset utils.create_dir(output) optim = torch.optim.Adamax(filter(lambda p: p.requires_grad, model.parameters()), lr=lr_default) \ if opt is None else opt if logger is None: logger = utils.Logger(os.path.join(output, 'log.txt')) best_eval_score = 0 utils.print_model(model, logger) logger.write('optim: adamax lr=%.4f, decay_step=%d, decay_rate=%.2f, grad_clip=%.2f' % \ (lr_default, lr_decay_step, lr_decay_rate, grad_clip)) model_path = os.path.join(output, 'model_epoch-1.pth') for epoch in range(s_epoch, num_epochs): total_loss = 0 train_score = 0 train_zcore = 0 total_norm = 0 count_norm = 0 n_answer_type = torch.zeros(len(dset.idx2type)) score_answer_type = torch.zeros(len(dset.idx2type)) t = time.time() N = len(train_loader.dataset) if epoch < len(gradual_warmup_steps): optim.param_groups[0]['lr'] = gradual_warmup_steps[epoch] logger.write('gradual warmup lr: %.4f' % optim.param_groups[0]['lr']) elif epoch in lr_decay_epochs: optim.param_groups[0]['lr'] *= lr_decay_rate logger.write('decreased lr: %.4f' % optim.param_groups[0]['lr']) else: logger.write('lr: %.4f' % optim.param_groups[0]['lr']) for i, (v, b, q, a, c, at) in enumerate(train_loader): v = v.cuda() b = b.cuda() q = q.cuda() a = a.cuda() c = c.cuda().unsqueeze(-1).float() at = at.cuda() answer_type = torch.zeros(v.size(0), len(dset.idx2type)).cuda() answer_type.scatter_(1, at.unsqueeze(1), 1) pred, conf, att = model(v, b, q, a, c) loss = instance_bce_with_logits(pred, a) loss.backward(retain_graph=True) losz = instance_bce_with_logits(conf, c) losz.backward() total_norm += nn.utils.clip_grad_norm_(model.parameters(), grad_clip) count_norm += 1 optim.step() optim.zero_grad() batch_score = compute_score_with_logits(pred, a.data) type_score = batch_score.sum(-1, keepdim=True) * answer_type batch_score = batch_score.sum() total_loss += loss.item() * v.size(0) train_score += batch_score.item() batch_zcore = compute_zcore_with_logits(conf, c.data).sum() train_zcore += batch_zcore.item() n_answer_type += answer_type.sum(0).cpu() score_answer_type += type_score.sum(0).cpu() total_loss /= N train_score = 100 * train_score / N train_zcore = 100 * train_zcore / N if None != eval_loader: model.train(False) eval_score, eval_zcore, bound, entropy, val_n_answer_type, val_score_answer_type = evaluate( model, eval_loader) model.train(True) logger.write('epoch %d, time: %.2f' % (epoch, time.time() - t)) logger.write( '\ttrain_loss: %.2f, norm: %.4f, score: %.2f, confidence: %.2f' % (total_loss, total_norm / count_norm, train_score, train_zcore)) if eval_loader is not None: logger.write('\teval score: %.2f (%.2f)' % (100 * eval_score, 100 * bound)) logger.write('\tconfidence: %.2f (%.2f)' % (100 * eval_zcore, 100)) if eval_loader is not None and entropy is not None: info = '' for i in range(entropy.size(0)): info = info + ' %.2f' % entropy[i] logger.write('\tentropy: ' + info) if (eval_loader is not None and eval_score > best_eval_score) or ( eval_loader is None and epoch >= saving_epoch): if save_one_ckpt and os.path.exists(model_path): os.remove(model_path) model_path = 
os.path.join(output, 'model_epoch%d.pth' % epoch) utils.save_model(model_path, model, epoch, optim) best_type = val_score_answer_type if eval_loader is not None: best_eval_score = eval_score return best_eval_score, bound, n_answer_type, val_n_answer_type, score_answer_type / n_answer_type, best_type / val_n_answer_type
def train(model, train_loader, eval_loader, num_epochs, output, opt=None, s_epoch=0): lr_default = 1e-3 if eval_loader is not None else 7e-4 lr_decay_step = 2 lr_decay_rate = 1 lr_decay_epochs = range( 10, 20, lr_decay_step) if eval_loader is not None else range( 10, 20, lr_decay_step) gradual_warmup_steps = [ 0.5 * lr_default, 1.0 * lr_default, 1.5 * lr_default, 2.0 * lr_default ] saving_epoch = 3 grad_clip = .25 utils.create_dir(output) optim = torch.optim.Adamax(filter(lambda p: p.requires_grad, model.parameters()), lr=lr_default) \ if opt is None else opt logger = utils.Logger(os.path.join(output, 'log.txt')) best_eval_score = 0 utils.print_model(model, logger) logger.write('optim: adamax lr=%.4f, decay_step=%d, decay_rate=%.2f, grad_clip=%.2f' % \ (lr_default, lr_decay_step, lr_decay_rate, grad_clip)) woman = 0 woman_true = 0 woman_man = 0 woman_other = 0 man = 0 man_true = 0 man_woman = 0 man_other = 0 for epoch in range(s_epoch, num_epochs): total_loss = 0 train_score = 0 total_norm = 0 count_norm = 0 t = time.time() N = len(train_loader.dataset) if epoch < len(gradual_warmup_steps): optim.param_groups[0]['lr'] = gradual_warmup_steps[epoch] logger.write('gradual warmup lr: %.4f' % optim.param_groups[0]['lr']) elif epoch in lr_decay_epochs: optim.param_groups[0]['lr'] = 1e-3 logger.write('decreased lr: %.4f' % optim.param_groups[0]['lr']) else: logger.write('lr: %.4f' % optim.param_groups[0]['lr']) import pickle as pkl from PIL import Image, ImageDraw lab2ans = pkl.load(open("./data/cache/trainval_label2ans.pkl", 'rb')) ''' for i, (v, b, q, a,ques,im,g,gender) in enumerate(train_loader): v = v.cuda() b = b.cuda() q = q.cuda() a = a.cuda() visual_pred, att = model(v, b, q, a) loss = instance_bce_with_logits(visual_pred, a) loss.backward() total_norm += nn.utils.clip_grad_norm_(model.parameters(), grad_clip) count_norm += 1 optim.step() optim.zero_grad() batch_score = compute_score_with_logits(visual_pred, a.data).sum() total_loss += loss.item() * v.size(0) train_score += batch_score.item() ''' total_loss /= N train_score = 100 * train_score / N if None != eval_loader: model.train(False) eval_score, bound, _ = evaluate(model, eval_loader) model.train(True) logger.write('epoch %d, time: %.2f' % (epoch, time.time() - t)) logger.write('\ttrain_loss: %.2f, norm: %.4f, score: %.2f' % (total_loss, total_norm / count_norm, train_score)) logger.write('\teval score: %.2f (%.2f)' % (100 * eval_score, 100 * bound)) if (eval_loader is not None and eval_score > best_eval_score) or ( eval_loader is None and epoch >= saving_epoch): model_path = os.path.join(output, 'model_epoch%d.pth' % epoch) utils.save_model(model_path, model, epoch, optim) if eval_loader is not None: best_eval_score = eval_score
def train(model, train_loader, eval_loader, num_epochs, output, opt=None, s_epoch=0): lr_default = 1e-3 if eval_loader is not None else 7e-4 lr_decay_step = 2 lr_decay_rate = 1 lr_decay_epochs = range( 10, 20, lr_decay_step) if eval_loader is not None else range( 10, 20, lr_decay_step) gradual_warmup_steps = [ 0.5 * lr_default, 1.0 * lr_default, 1.5 * lr_default, 2.0 * lr_default ] saving_epoch = 3 grad_clip = .25 utils.create_dir(output) optim = torch.optim.Adamax(filter(lambda p: p.requires_grad, model.parameters()), lr=lr_default) \ if opt is None else opt logger = utils.Logger(os.path.join(output, 'log.txt')) best_eval_score = 0 utils.print_model(model, logger) logger.write('optim: adamax lr=%.4f, decay_step=%d, decay_rate=%.2f, grad_clip=%.2f' % \ (lr_default, lr_decay_step, lr_decay_rate, grad_clip)) for epoch in range(s_epoch, num_epochs): total_loss = 0 train_score = 0 total_norm = 0 count_norm = 0 t = time.time() N = len(train_loader.dataset) print(N) if epoch < len(gradual_warmup_steps): optim.param_groups[0]['lr'] = gradual_warmup_steps[epoch] logger.write('gradual warmup lr: %.4f' % optim.param_groups[0]['lr']) elif epoch in lr_decay_epochs: optim.param_groups[0]['lr'] = 1e-3 logger.write('decreased lr: %.4f' % optim.param_groups[0]['lr']) else: logger.write('lr: %.4f' % optim.param_groups[0]['lr']) ''' for i, (v, b, q, a,ques,im,g,gender) in enumerate(train_loader): v = v.cuda() b = b.cuda() q = q.cuda() a = a.cuda() visual_pred, att = model(v, b, q, a) #import pdb;pdb.set_trace() gender=gender.squeeze(1) weights=torch.Tensor([2.0,1.0,0.0001]).cuda() #loss = instance_bce_with_logits(visual_pred, g.cuda()) loss=nn.CrossEntropyLoss(weights) loss=loss(visual_pred,gender.cuda()) #import pdb;pdb.set_trace() loss.backward() total_norm += nn.utils.clip_grad_norm_(model.parameters(), grad_clip) count_norm += 1 optim.step() optim.zero_grad() batch_score=torch.eq(visual_pred.argmax(1),gender.cuda()).sum() #batch_score = compute_score_with_logits(visual_pred, g.cuda()).sum() #total_loss += loss.item() * v.size(0) train_score += batch_score.item() #train_score+=batch_score ''' total_loss /= N train_score = 100 * train_score / N if None != eval_loader: model.train(False) eval_score, bound, _ = evaluate(model, eval_loader) model.train(True) logger.write('epoch %d, time: %.2f' % (epoch, time.time() - t)) logger.write('\ttrain_loss: %.2f, norm: %.4f, score: %.2f' % (total_loss, total_norm / count_norm, train_score)) logger.write('\teval score: %.2f (%.2f)' % (100 * eval_score, 100 * bound)) model_path = os.path.join(output, 'model_epoch%d.pth' % epoch) utils.save_model(model_path, model, epoch, optim)
def train(model, train_loader, eval_loader, num_epochs, output, opt=None, s_epoch=0): lr_default = 1e-3 if eval_loader is not None else 7e-4 lr_decay_step = 2 lr_decay_rate = 0.01 lr_decay_epochs = range(16,50,lr_decay_step) if eval_loader is not None else range(10,20,lr_decay_step) gradual_warmup_steps = [0.5 * lr_default, 1.0 * lr_default, 1.5 * lr_default, 2.0 * lr_default] saving_epoch = 3 grad_clip = .25 utils.create_dir(output) optim = torch.optim.Adamax(filter(lambda p: p.requires_grad, model.parameters()), lr=lr_default) \ if opt is None else opt logger = utils.Logger(os.path.join(output, 'log.txt')) best_eval_score = 0 utils.print_model(model, logger) logger.write('optim: adamax lr=%.4f, decay_step=%d, decay_rate=%.2f, grad_clip=%.2f' % \ (lr_default, lr_decay_step, lr_decay_rate, grad_clip)) for epoch in range(s_epoch, num_epochs): total_loss = 0 train_score = 0 train_score_vqa=0 total_norm = 0 count_norm = 0 total_fair_loss=0 total_dis_loss=0 woman=0 woman_o=0 man=0 man_o=0 other=0 other_o=0 t = time.time() N = len(train_loader.dataset) print(N) if epoch < len(gradual_warmup_steps): optim.param_groups[0]['lr'] = gradual_warmup_steps[epoch] logger.write('gradual warmup lr: %.4f' % optim.param_groups[0]['lr']) elif epoch in lr_decay_epochs: optim.param_groups[0]['lr'] =optim.param_groups[0]['lr']*lr_decay_rate logger.write('decreased lr: %.4f' % optim.param_groups[0]['lr']) else: logger.write('lr: %.4f' % optim.param_groups[0]['lr']) for name,subnet in model.named_children(): if name=='w_emb' or name=='q_emb' or name=='q_att' or name=='v_att' or name=='v_net' or name=='q_net' or name=='classifier2': print(name) for param in subnet.parameters(): param.requires_grad=False for i, (v, b, q, a,ques,im,g,gender) in enumerate(train_loader): v = v.cuda() b = b.cuda() q = q.cuda() a = a.cuda() visual_pred, vqa_pred,att = model(v, b, q, a) #import pdb;pdb.set_trace() gender=gender.squeeze(1) weights=torch.Tensor([2.0,1.0,0.001]).cuda() vqa_loss = instance_bce_with_logits(vqa_pred, a) loss=nn.CrossEntropyLoss(weights) loss=loss(visual_pred,gender.cuda()) #dis_loss=torch.abs(visual_pred[:,0]-visual_pred[:,1]).mean() #dis_loss=dis_loss.cuda() if epoch < 12: t_loss=vqa_loss else: t_loss=loss+vqa_loss t_loss.backward() #import pdb;pdb.set_trace() #vp=visual_pred[:,:2].cuda() #g=g[:,:2] #crossloss=instance_bce_with_logits(vp,g.cuda()) #mseloss=torch.nn.functional.mse_loss(vp.softmax(1),g.cuda()) #g_swap=g[:,[1,0]].cuda() #swap_loss=(vp.softmax(1)*g_swap).sum(1) #swap_loss=swap_loss.sum() for j in range(len(v)): if gender[j]==0: woman=woman+1 #if visual_pred[j].argmax()==0 or visual_pred[j].argmax()==1: if visual_pred[j].argmax()==gender[j].cuda(): woman_o=woman_o+1 elif gender[j]==1: #if visual_pred[j].argmax()==0 or visual_pred[j].argmax()==1: man=man+1 if visual_pred[j].argmax()==gender[j].cuda(): man_o=man_o+1 else: other=other+1 if visual_pred[j].argmax()==gender[j].cuda(): other_o=other_o+1 total_norm += nn.utils.clip_grad_norm_(model.parameters(), grad_clip) count_norm += 1 optim.step() optim.zero_grad() #total_fair_loss+=soft_fair_loss #total_dis_loss+=dis_loss batch_score=torch.eq(visual_pred.argmax(1),gender.cuda()).sum() batch_score_vqa = compute_score_with_logits(vqa_pred, a.data).sum() #batch_score = compute_score_with_logits(visual_pred, g.cuda()).sum() #total_loss += loss.item() * v.size(0) train_score += batch_score.item() train_score_vqa+=batch_score_vqa.item() #train_score+=batch_score if i==0: print(loss) #print(10*soft_fair_loss) print("\n\n") total_loss /= N train_score = 100 
* train_score / N train_score_vqa = 100 * train_score_vqa / N print("epoch",epoch) woman_score=float(woman_o)/woman man_score=float(man_o)/man other_score=float(other_o)/other print("woman",woman) print("man",man) print("other",other) print("train_woman_score",woman_score*100) print("train_man_score",man_score*100) print("train_other_score",other_score*100) print("vqa",train_score_vqa) if None != eval_loader: model.train(False) eval_score, bound, _ = evaluate(model, eval_loader) model.train(True) #print("total_fair_loss",total_fair_loss) #print("totla_dis_loss",total_dis_loss) logger.write('epoch %d, time: %.2f' % (epoch, time.time()-t)) logger.write('\ttrain_loss: %.2f, norm: %.4f, score: %.2f' % (total_loss, total_norm/count_norm, train_score)) #logger.write('\total_fair_loss: %.2f, norm: %.4f, score: %.2f' % (total_loss, total_norm/count_norm, total_fair_loss)) logger.write('\teval score: %.2f (%.2f)' % (100 * eval_score, 100 * bound)) model_path = os.path.join(output, 'model_epoch%d.pth' % epoch) utils.save_model(model_path, model, epoch, optim)
def train(model, train_loader, eval_loader, num_epochs, output, opt=None, s_epoch=0):
    lr_default = 1e-3 if eval_loader is not None else 7e-4
    lr_decay_step = 2
    lr_decay_rate = .25
    lr_decay_epochs = range(10, 20, lr_decay_step) if eval_loader is not None else range(10, 20, lr_decay_step)
    gradual_warmup_steps = [0.5 * lr_default, 1.0 * lr_default, 1.5 * lr_default, 2.0 * lr_default]
    saving_epoch = 3
    grad_clip = .25

    utils.create_dir(output)
    optim = torch.optim.Adamax(filter(lambda p: p.requires_grad, model.parameters()), lr=lr_default) \
        if opt is None else opt
    logger = utils.Logger(os.path.join(output, 'log.txt'))
    best_eval_score = 0

    utils.print_model(model, logger)
    logger.write('optim: adamax lr=%.4f, decay_step=%d, decay_rate=%.2f, grad_clip=%.2f' %
                 (lr_default, lr_decay_step, lr_decay_rate, grad_clip))

    for epoch in range(s_epoch, num_epochs):
        total_loss = 0
        train_score = 0
        total_norm = 0
        count_norm = 0
        t = time.time()
        N = len(train_loader.dataset)
        if epoch < len(gradual_warmup_steps):
            optim.param_groups[0]['lr'] = gradual_warmup_steps[epoch]
            logger.write('gradual warmup lr: %.4f' % optim.param_groups[0]['lr'])
        elif epoch in lr_decay_epochs:
            optim.param_groups[0]['lr'] *= lr_decay_rate
            logger.write('decreased lr: %.4f' % optim.param_groups[0]['lr'])
        else:
            logger.write('lr: %.4f' % optim.param_groups[0]['lr'])

        for i, (v, b, q, a) in enumerate(train_loader):
            v = Variable(v).cuda()
            b = Variable(b).cuda()
            q = Variable(q).cuda()
            a = Variable(a).cuda()

            pred, att = model(v, b, q, a)
            loss = instance_bce_with_logits(pred, a)
            loss.backward()
            total_norm += nn.utils.clip_grad_norm(model.parameters(), grad_clip)
            count_norm += 1
            optim.step()
            optim.zero_grad()

            batch_score = compute_score_with_logits(pred, a.data).sum()
            total_loss += loss.data[0] * v.size(0)
            train_score += batch_score

        total_loss /= N
        train_score = 100 * train_score / N

        if None != eval_loader:
            model.train(False)
            eval_score, bound, entropy = evaluate(model, eval_loader)
            model.train(True)

        logger.write('epoch %d, time: %.2f' % (epoch, time.time() - t))
        logger.write('\ttrain_loss: %.2f, norm: %.4f, score: %.2f' %
                     (total_loss, total_norm / count_norm, train_score))
        if eval_loader is not None:
            logger.write('\teval score: %.2f (%.2f)' % (100 * eval_score, 100 * bound))
        if eval_loader is not None and entropy is not None:
            info = ''
            for i in range(entropy.size(0)):
                info = info + ' %.2f' % entropy[i]
            logger.write('\tentropy: ' + info)

        if (eval_loader is not None and eval_score > best_eval_score) or (eval_loader is None and epoch >= saving_epoch):
            model_path = os.path.join(output, 'model_epoch%d.pth' % epoch)
            utils.save_model(model_path, model, epoch, optim)
            if eval_loader is not None:
                best_eval_score = eval_score
def train(model, train_loader, eval_loader, args, device=torch.device("cuda")): N = len(train_loader.dataset) lr_default = args.base_lr num_epochs = args.epochs lr_decay_epochs = range(args.lr_decay_start, num_epochs, args.lr_decay_step) gradual_warmup_steps = [ 0.5 * lr_default, 1.0 * lr_default, 1.5 * lr_default, 2.0 * lr_default ] optim = torch.optim.Adamax(filter(lambda p: p.requires_grad, model.parameters()), lr=lr_default, betas=(0.9, 0.999), eps=1e-8, weight_decay=args.weight_decay) logger = utils.Logger(os.path.join(args.output, 'log.txt')) best_eval_score = 0 utils.print_model(model, logger) logger.write('optim: adamax lr=%.4f, decay_step=%d, decay_rate=%.2f,' % (lr_default, args.lr_decay_step, args.lr_decay_rate) + 'grad_clip=%.2f' % args.grad_clip) logger.write('LR decay epochs: ' + ','.join([str(i) for i in lr_decay_epochs])) last_eval_score, eval_score = 0, 0 relation_type = train_loader.dataset.relation_type for epoch in range(0, num_epochs): pbar = tqdm(total=len(train_loader)) total_norm, count_norm = 0, 0 total_loss, train_score = 0, 0 count, average_loss, att_entropy = 0, 0, 0 t = time.time() if epoch < len(gradual_warmup_steps): for i in range(len(optim.param_groups)): optim.param_groups[i]['lr'] = gradual_warmup_steps[epoch] logger.write('gradual warmup lr: %.4f' % optim.param_groups[-1]['lr']) elif (epoch in lr_decay_epochs or eval_score < last_eval_score and args.lr_decay_based_on_val): for i in range(len(optim.param_groups)): optim.param_groups[i]['lr'] *= args.lr_decay_rate logger.write('decreased lr: %.4f' % optim.param_groups[-1]['lr']) else: logger.write('lr: %.4f' % optim.param_groups[-1]['lr']) last_eval_score = eval_score mini_batch_count = 0 batch_multiplier = args.grad_accu_steps for i, (v, norm_bb, q, q_target, target, _, _, bb, spa_adj_matrix, sem_adj_matrix) in enumerate(train_loader): batch_size = v.size(0) num_objects = v.size(1) if mini_batch_count == 0: optim.step() optim.zero_grad() mini_batch_count = batch_multiplier ### Debugging ### # with autograd.detect_anomaly(): v = Variable(v).to(device) norm_bb = Variable(norm_bb).to(device) q = Variable(q).to(device) q_target = Variable(q_target).to(device) target = Variable(target).to(device) pos_emb, sem_adj_matrix, spa_adj_matrix = prepare_graph_variables( relation_type, bb, sem_adj_matrix, spa_adj_matrix, num_objects, args.nongt_dim, args.imp_pos_emb_dim, args.spa_label_num, args.sem_label_num, device) q_type, pred, att = model(v, norm_bb, q, pos_emb, sem_adj_matrix, spa_adj_matrix, target) loss = instance_bce_with_logits( pred, target) + instance_bce_with_logits(q_type, q_target) loss /= batch_multiplier loss.backward() mini_batch_count -= 1 total_norm += nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip) count_norm += 1 batch_score = compute_score_with_logits(pred, target, device).sum() total_loss += loss.data.item() * batch_multiplier * v.size(0) train_score += batch_score pbar.update(1) if args.log_interval > 0: average_loss += loss.data.item() * batch_multiplier if model.module.fusion == "ban": current_att_entropy = torch.sum(calc_entropy(att.data)) att_entropy += current_att_entropy / batch_size / att.size( 1) count += 1 if i % args.log_interval == 0: att_entropy /= count average_loss /= count print( "step {} / {} (epoch {}), ave_loss {:.3f},".format( i, len(train_loader), epoch, average_loss), "att_entropy {:.3f}".format(att_entropy)) average_loss = 0 count = 0 att_entropy = 0 total_loss /= N train_score = 100 * train_score / N if eval_loader is not None: eval_score, bound, entropy = 
evaluate(model, eval_loader, device, args) logger.write('epoch %d, time: %.2f' % (epoch, time.time() - t)) logger.write('\ttrain_loss: %.2f, norm: %.4f, score: %.2f' % (total_loss, total_norm / count_norm, train_score)) if eval_loader is not None: logger.write('\teval score: %.2f (%.2f)' % (100 * eval_score, 100 * bound)) if entropy is not None: info = '' for i in range(entropy.size(0)): info = info + ' %.2f' % entropy[i] logger.write('\tentropy: ' + info) if (eval_loader is not None)\ or (eval_loader is None and epoch >= args.saving_epoch): logger.write("saving current model weights to folder") model_path = os.path.join(args.output, 'model_%d.pth' % epoch) opt = optim if args.save_optim else None utils.save_model(model_path, model, epoch, opt)
def main(): global best_acc start_epoch = args.start_epoch # start from epoch 0 or last checkpoint epoch if not os.path.isdir(args.checkpoint): mkdir_p(args.checkpoint) # Data print('==> Preparing dataset %s' % args.dataset) transform_train = transforms.Compose([ transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) transform_test = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) if args.dataset == 'cifar10': dataloader = datasets.CIFAR10 num_classes = 10 else: dataloader = datasets.CIFAR100 num_classes = 100 trainset = dataloader(root='./data', train=True, download=True, transform=transform_train) trainloader = data.DataLoader(trainset, batch_size=args.train_batch, shuffle=True, num_workers=args.workers) testset = dataloader(root='./data', train=False, download=False, transform=transform_test) testloader = data.DataLoader(testset, batch_size=args.test_batch, shuffle=False, num_workers=args.workers) # Model print("==> creating model '{}'".format(args.arch)) if args.arch.startswith('resnext'): model = models.__dict__[args.arch]( cardinality=args.cardinality, num_classes=num_classes, depth=args.depth, widen_factor=args.widen_factor, dropRate=args.drop, ) elif args.arch.startswith('densenet'): model = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, growthRate=args.growthRate, compressionRate=args.compressionRate, dropRate=args.drop, ) elif args.arch.startswith('wrn'): model = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, widen_factor=args.widen_factor, dropRate=args.drop, ) elif args.arch.endswith('resnet'): model = models.__dict__[args.arch]( num_classes=num_classes, depth=args.depth, block_name=args.block_name, ) else: model = models.__dict__[args.arch](num_classes=num_classes) print("Geometric LR: {}".format(args.geo_lr)) model = torch.nn.DataParallel(model).cuda() cudnn.benchmark = True print(' Total params: %.2fM' % (sum(p.numel() for p in model.parameters()) / 1000000.0)) criterion = nn.CrossEntropyLoss() param_lr = GradientRatioScheduler.get_params_base_lr(model, args.lr) optimizer = optim.SGD(param_lr, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) scheduler = GradientRatioScheduler(optimizer) print_model(model) input("Cont?") # Resume title = 'cifar-10-' + args.arch if args.resume: # Load checkpoint. print('==> Resuming from checkpoint..') assert os.path.isfile( args.resume), 'Error: no checkpoint directory found!' 
args.checkpoint = os.path.dirname(args.resume) checkpoint = torch.load(args.resume) best_acc = checkpoint['best_acc'] start_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title, resume=True) else: logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title) logger.set_names([ 'Epoch', 'Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.', 'Time', 'Learning Rate' ]) if args.evaluate: print('\nEvaluation only') test_loss, test_acc = test(testloader, model, criterion, start_epoch, use_cuda) print(' Test Loss: %.8f, Test Acc: %.2f' % (test_loss, test_acc)) return # Train and val for epoch in range(start_epoch, args.epochs): adjust_learning_rate(scheduler, epoch) print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, max(scheduler.get_lr()))) st = time.time() train_loss, train_acc = train(trainloader, model, criterion, optimizer, scheduler, epoch, use_cuda) test_loss, test_acc = test(testloader, model, criterion, epoch, use_cuda) # append logger file logger.append([ epoch, train_loss, test_loss, train_acc, test_acc, time.time() - st, scheduler.get_lr() ]) # save model is_best = test_acc > best_acc best_acc = max(test_acc, best_acc) if args.save_checkpoint_model: save_checkpoint( { 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'acc': test_acc, 'best_acc': best_acc, 'optimizer': optimizer.state_dict(), }, is_best, checkpoint=args.checkpoint) logger.close() logger.plot() savefig(os.path.join(args.checkpoint, 'log.eps')) print('Best acc:') print(best_acc)
def train(model, train_loader, eval_loader, output):
    lr_default = 0.01
    grad_clip = .25
    epoch = 0
    i_iter = 0
    max_iter = 45071
    utils.create_dir(output)
    optim = torch.optim.Adamax(filter(lambda p: p.requires_grad, model.parameters()), lr=lr_default)
    scheduler = get_optim_scheduler(optim)
    logger = utils.Logger(os.path.join(output, 'log.txt'))
    best_eval_score = 0
    utils.print_model(model, logger)

    while i_iter < max_iter:
        total_loss = 0
        train_score = 0
        total_norm = 0
        count_norm = 0
        epoch = epoch + 1
        N = len(train_loader.dataset)
        logger.write('lr: %.4f' % optim.param_groups[0]['lr'])
        t = time.time()
        for i, (v, b, q, a) in enumerate(train_loader):
            i_iter = i_iter + 1
            if i_iter > max_iter:
                break
            scheduler.step(i_iter)
            optim.zero_grad()
            v = Variable(v).cuda()
            b = Variable(b).cuda()
            q = Variable(q).cuda()
            a = Variable(a).cuda()
            pred = model(v, b, q, a)
            loss = instance_bce_with_logits(pred, a)
            loss.backward()
            total_norm += nn.utils.clip_grad_norm(model.parameters(), grad_clip)
            count_norm += 1
            batch_score = compute_score_with_logits(pred, a.data).sum()
            total_loss += loss.data[0] * v.size(0)
            train_score += batch_score
            # print('batch_score: %.2f' % (batch_score))
            # print(train_score)
            optim.step()

        total_loss /= N
        train_score = 100 * train_score / N

        if None != eval_loader:
            model.train(False)
            eval_score = evaluate(model, eval_loader)
            model.train(True)

        logger.write('epoch: %d, time: %.2f' % (epoch, time.time() - t))
        logger.write('\ttrain_loss: %.2f, norm: %.4f, score: %.2f' %
                     (total_loss, total_norm / count_norm, train_score))
        if eval_loader is not None:
            logger.write('\teval score: %.2f' % (100 * eval_score))

        if (eval_loader is not None and eval_score > best_eval_score):
            model_path = os.path.join(output, 'model_epoch%d.pth' % (epoch))
            # save the current epoch number (the original passed the builtin `iter` here)
            utils.save_model(model_path, model, epoch, optim)
            if eval_loader is not None:
                best_eval_score = eval_score
def train(model, train_loader, eval_loader, num_epochs, output, opt=None, s_epoch=0): lr_default = 1e-3 if eval_loader is not None else 7e-4 lr_decay_step = 2 lr_decay_rate = .25 lr_decay_epochs = range( 10, 20, lr_decay_step) if eval_loader is not None else range( 10, 20, lr_decay_step) gradual_warmup_steps = [ 0.5 * lr_default, 1.0 * lr_default, 1.5 * lr_default, 2.0 * lr_default ] saving_epoch = 3 grad_clip = .25 utils.create_dir(output) optim = torch.optim.Adamax(filter(lambda p: p.requires_grad, model.parameters()), lr=lr_default) \ if opt is None else opt logger = utils.Logger(os.path.join(output, 'log.txt')) best_eval_score = 0 utils.print_model(model, logger) logger.write('optim: adamax lr=%.4f, decay_step=%d, decay_rate=%.2f, grad_clip=%.2f' % \ (lr_default, lr_decay_step, lr_decay_rate, grad_clip)) for epoch in range(s_epoch, num_epochs): total_loss = 0 train_score = 0 total_norm = 0 count_norm = 0 t = time.time() N = 0 if epoch < len(gradual_warmup_steps): optim.param_groups[0]['lr'] = gradual_warmup_steps[epoch] logger.write('gradual warmup lr: %.4f' % optim.param_groups[0]['lr']) elif epoch in lr_decay_epochs: optim.param_groups[0]['lr'] *= lr_decay_rate logger.write('decreased lr: %.4f' % optim.param_groups[0]['lr']) else: logger.write('lr: %.4f' % optim.param_groups[0]['lr']) for i, (v, b, p, e, n, a, idx, types) in enumerate(train_loader): v = v.cuda() b = b.cuda() p = p.cuda() e = e.cuda() a = a.cuda() _, logits = model(v, b, p, e, a) n_obj = logits.size(2) logits.squeeze_() merged_logit = torch.cat( tuple(logits[j, :, :n[j][0]] for j in range(n.size(0))), -1).permute(1, 0) merged_a = torch.cat( tuple(a[j, :n[j][0], :n_obj] for j in range(n.size(0))), 0) loss = instance_bce_with_logits(merged_logit, merged_a, 'sum') / v.size(0) N += n.sum().float() batch_score = compute_score_with_logits(merged_logit, merged_a.data).sum() loss.backward() total_norm += nn.utils.clip_grad_norm_(model.parameters(), grad_clip) count_norm += 1 optim.step() optim.zero_grad() total_loss += loss.item() * v.size(0) train_score += batch_score.item() total_loss /= N train_score = 100 * train_score / N if None != eval_loader: model.train(False) eval_score, bound, entropy = evaluate(model, eval_loader) model.train(True) logger.write('epoch %d, time: %.2f' % (epoch, time.time() - t)) logger.write('\ttrain_loss: %.2f, norm: %.4f, score: %.2f' % (total_loss, total_norm / count_norm, train_score)) if eval_loader is not None: logger.write('\teval score: %.2f/%.2f/%.2f (%.2f)' % (100 * eval_score[0], 100 * eval_score[1], 100 * eval_score[2], 100 * bound)) eval_score = eval_score[0] if eval_loader is not None and entropy is not None: info = '' for i in range(entropy.size(0)): info = info + ' %.2f' % entropy[i] logger.write('\tentropy: ' + info) if (eval_loader is not None and eval_score > best_eval_score) or ( eval_loader is None and epoch >= saving_epoch): model_path = os.path.join(output, 'model_epoch%d.pth' % epoch) utils.save_model(model_path, model, epoch, optim) if eval_loader is not None: best_eval_score = eval_score
def train6(model, train_loader, eval_loader, num_epochs, output,s_epoch=0): lr_default=0.001 grad_clip = .25 utils.create_dir(output) lr_decay_step = 2 lr_decay_rate = .5 lr_decay_epochs = range(9, 12, lr_decay_step) gradual_warmup_steps = [0.5 * lr_default, 1.0 * lr_default,2.0*lr_default] optim = torch.optim.Adamax(filter(lambda p: p.requires_grad, model.parameters()), lr=lr_default) logger = utils.Logger(os.path.join(output, 'log.txt')) best_eval_score = 0 utils.print_model(model, logger) for epoch in range(s_epoch, num_epochs): total_loss = 0 train_score = 0 total_norm = 0 count_norm = 0 t = time.time() N = len(train_loader.dataset) if epoch < len(gradual_warmup_steps): optim.param_groups[0]['lr'] = gradual_warmup_steps[epoch] logger.write('gradual warmup lr: %.4f' % optim.param_groups[0]['lr']) elif epoch in lr_decay_epochs: optim.param_groups[0]['lr'] *= lr_decay_rate logger.write('decreased lr: %.4f' % optim.param_groups[0]['lr']) else: logger.write('lr: %.4f' % optim.param_groups[0]['lr']) for i, (v, b, q, a,image_id) in enumerate(train_loader): v = Variable(v).cuda() b = Variable(b).cuda() q = Variable(q).cuda() a = Variable(a).cuda() pred= model(v, b, q, a) loss = instance_bce_with_logits(pred, a) loss.backward() total_norm += nn.utils.clip_grad_norm(model.parameters(), grad_clip) count_norm += 1 optim.step() optim.zero_grad() batch_score = compute_score_with_logits(pred, a.data).sum() total_loss += loss.data[0] * v.size(0) train_score += batch_score total_loss /= N train_score = 100 * train_score / N if None != eval_loader: model.train(False) eval_score,eval_loss= evaluate3(model, eval_loader) model.train(True) logger.write('epoch %d, time: %.2f' % (epoch, time.time()-t)) logger.write('\ttrain_loss: %.2f, norm: %.4f, score: %.2f' % (total_loss, total_norm/count_norm, train_score)) if eval_loader is not None: logger.write('\teval score: %.2f,eval loss:%.2f' % (100 * eval_score,eval_loss)) if (eval_loader is not None and eval_score > best_eval_score) or (eval_loader is None and epoch>=0): model_path = os.path.join(output, 'model_epoch%d.pth' % epoch) utils.save_model(model_path, model, epoch, optim) if eval_loader is not None: best_eval_score = eval_score
def run_trainer(data_loader: dict, model: models, optimizer: optim, lr_scheduler: optim.lr_scheduler, criterion: nn, train_epochs: int, log_training_progress_every: int, log_val_progress_every: int, checkpoint_every: int, tb_summaries_dir: str, chkpt_dir: str, resume_from: str, to_device: object, to_cpu: object, attackers: object = None, train_adv_periodic_ops: int = None, *args, **kwargs): def mk_lr_step(loss): lr_scheduler.step(loss) def train_step(engine, batch): model.train() optimizer.zero_grad() x, y = map(lambda _: to_device(_), batch) if (train_adv_periodic_ops is not None) and ( engine.state.iteration % train_adv_periodic_ops == 0): random_attacker = random.choice(list(attackers)) x = attackers[random_attacker].perturb(x, y) y_pred = model(x) loss = criterion(y_pred, y) loss.backward() optimizer.step() return loss.item() def eval_step(engine, batch): model.eval() with torch.no_grad(): x, y = map(lambda _: to_device(_), batch) if random.choice(range(2)) % 2 == 0: random_attacker = random.choice(list(attackers)) x = attackers[random_attacker].perturb(x, y) y_pred = model(x) return y_pred, y def chkpt_score_func(engine): val_eval.run(data_loader['val']) y_pred, y = val_eval.state.output loss = criterion(y_pred, y) return np.mean(to_cpu(loss, convert_to_np=True)) # set up ignite engines trainer = Engine(train_step) train_eval = Engine(eval_step) val_eval = Engine(eval_step) @trainer.on(Events.ITERATION_COMPLETED(every=log_training_progress_every)) def log_training_results(engine): step = True run_type = 'train' train_eval.run(data_loader['train']) y_pred, y = train_eval.state.output loss = criterion(y_pred, y) log_results(to_cpu(y_pred, convert_to_np=True), to_cpu(y, convert_to_np=True), to_cpu(loss, convert_to_np=True), run_type, step, engine.state.iteration, total_train_steps, writer) @trainer.on(Events.ITERATION_COMPLETED(every=log_val_progress_every)) def log_val_results(engine): step = True run_type = 'val' val_eval.run(data_loader['val']) y_pred, y = val_eval.state.output loss = criterion(y_pred, y) mk_lr_step(loss) log_results(to_cpu(y_pred, convert_to_np=True), to_cpu(y, convert_to_np=True), to_cpu(loss, convert_to_np=True), run_type, step, engine.state.iteration, total_train_steps, writer) # set up vars total_train_steps = len(data_loader['train']) * train_epochs # reporter to identify memory usage # bottlenecks throughout network reporter = MemReporter() print_model(model, reporter) # set up tensorboard summary writer writer = create_summary_writer(model, data_loader['train'], tb_summaries_dir) # move model to device model = to_device(model) # set up progress bar RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss') pbar = ProgressBar(persist=True, bar_format="") pbar.attach(trainer, ['loss']) # set up checkpoint objects_to_checkpoint = { 'trainer': trainer, 'model': model, 'optimizer': optimizer, 'lr_scheduler': lr_scheduler } training_checkpoint = Checkpoint(to_save=objects_to_checkpoint, save_handler=DiskSaver( chkpt_dir, require_empty=False), n_saved=3, filename_prefix='best', score_function=chkpt_score_func, score_name='val_loss') # register events trainer.add_event_handler( Events.ITERATION_COMPLETED(every=checkpoint_every), training_checkpoint) # if resuming if resume_from and os.path.exists(resume_from): print(f'resume model from: {resume_from}') checkpoint = torch.load(resume_from) Checkpoint.load_objects(to_load=objects_to_checkpoint, checkpoint=checkpoint) # fire training engine trainer.run(data_loader['train'], max_epochs=train_epochs)
def main(_): ps_hosts = FLAGS.ps_hosts.split(",") worker_hosts = FLAGS.worker_hosts.split(",") # Create a cluster from the parameter server and worker hosts. cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts}) if FLAGS.job_name == "ps": ps_config = tf.ConfigProto(gpu_options=tf.GPUOptions( per_process_gpu_memory_fraction=0.00001)) # Create and start a server for the local task. server = tf.train.Server( cluster, # protocol = "grpc_rdma", job_name=FLAGS.job_name, task_index=FLAGS.task_index, config=ps_config) server.join() elif FLAGS.job_name == "worker": # Create and start a server for the local task. server = tf.train.Server( cluster, # protocol = "grpc_rdma", job_name=FLAGS.job_name, task_index=FLAGS.task_index) local_worker_device = "/job:worker/task:%d" % FLAGS.task_index with tf.device( tf.train.replica_device_setter( ps_device='/job:ps/cpu:0', worker_device=local_worker_device, cluster=cluster)): if FLAGS.network == 'lstm': from models.lstm import KitModel elif FLAGS.network == 'gru': from models.gru import KitModel elif FLAGS.network == 'fc': from models.fullyconnect import KitModel elif FLAGS.network == 'alexnet': from models.alexnet import KitModel elif FLAGS.network == 'vgg16': from models.vgg16 import KitModel elif FLAGS.network == 'vgg19' or FLAGS.network == 'vgg_e': from models.vgg19 import KitModel elif FLAGS.network == 'inception_v3': from models.inception_v3 import KitModel elif FLAGS.network == 'resnet': from models.resnet import KitModel elif FLAGS.network == 'seq2seq': import models.translate.translate from models.translate.translate import dist_train dist_train(FLAGS, server, cluster) sys.exit() else: sys.exit("Invalid network [%s]" % args.network) this_model = KitModel(FLAGS) this_model.build_model() train_dir = tempfile.mkdtemp() sess_config = tf.ConfigProto( allow_soft_placement=True, log_device_placement=False, device_filters=[ "/job:ps", "/job:worker/task:%d" % FLAGS.task_index ], graph_options=tf.GraphOptions( optimizer_options=tf.OptimizerOptions( opt_level=tf.OptimizerOptions.L1)), gpu_options=tf.GPUOptions(visible_device_list="")) if FLAGS.infer_shapes == True: sess_config.graph_options.infer_shapes = FLAGS.infer_shapes sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0), logdir=train_dir, init_op=tf.global_variables_initializer(), global_step=this_model.global_step, summary_writer=None, saver=None) if FLAGS.task_index == 0: print("Worker %d: Initializing session..." % FLAGS.task_index) else: print("Worker %d: Waiting for session to be initialized..." % FLAGS.task_index) sess = sv.prepare_or_wait_for_session(server.target, config=sess_config, start_standard_services=True) print_model() print("Start warmup %d epoch." % FLAGS.warmup) for _ in range(FLAGS.warmup): this_model.get_data() sess.run(this_model.train_op, feed_dict=this_model.get_feed_dict()) current_step = 0 duration = 0 while current_step < FLAGS.epoch: current_step += 1 this_model.get_data() print("Start step %d" % current_step) start_time = time.time() _, step_loss = sess.run([this_model.train_op, this_model.cost], feed_dict=this_model.get_feed_dict()) end_time = time.time() print( "Finish step %d, loss = %f, speed = %f sampes/s, duration = %f seconds" % (current_step, step_loss, FLAGS.batch_size / (end_time - start_time), end_time - start_time)) duration += end_time - start_time print("Total Time = %f s." % duration) #writer.close() else: sys.exit("Invalid job role name [%s]!" % args.job_name)
def train(model, train_loader, eval_loader, opt): utils.create_dir(opt.output) optim = torch.optim.Adam(model.parameters(), lr=opt.learning_rate, betas=(0.9, 0.999), eps=1e-08, weight_decay=opt.weight_decay) logger = utils.Logger(os.path.join(opt.output, 'log.txt')) utils.print_model(model, logger) for param_group in optim.param_groups: param_group['lr'] = opt.learning_rate scheduler = MultiStepLR(optim, milestones=[100], gamma=0.8) scheduler.last_epoch = opt.s_epoch best_eval_score = 0 for epoch in range(opt.s_epoch, opt.num_epochs): total_loss = 0 total_norm = 0 count_norm = 0 train_score = 0 t = time.time() N = len(train_loader.dataset) scheduler.step() for i, (v, b, a, _, qa_text, _, _, q_t, bias) in enumerate(train_loader): v = v.cuda() b = b.cuda() a = a.cuda() bias = bias.cuda() qa_text = qa_text.cuda() rand_index = random.sample(range(0, opt.train_candi_ans_num), opt.train_candi_ans_num) qa_text = qa_text[:,rand_index,:] a = a[:,rand_index] bias = bias[:,rand_index] if opt.lp == 0: logits = model(qa_text, v, b, epoch, 'train') loss = instance_bce_with_logits(logits, a, reduction='mean') elif opt.lp == 1: logits = model(qa_text, v, b, epoch, 'train') loss_pos = instance_bce_with_logits(logits, a, reduction='mean') index = random.sample(range(0, v.shape[0]), v.shape[0]) v_neg = v[index] b_neg = b[index] logits_neg = model(qa_text, v_neg, b_neg, epoch, 'train') self_loss = compute_self_loss(logits_neg, a) loss = loss_pos + opt.self_loss_weight * self_loss elif opt.lp == 2: logits, loss = model(qa_text, v, b, epoch, 'train', bias, a) else: assert 1==2 loss.backward() total_norm += nn.utils.clip_grad_norm_(model.parameters(), opt.grad_clip) count_norm += 1 optim.step() optim.zero_grad() score = compute_score_with_logits(logits, a.data).sum() train_score += score.item() total_loss += loss.item() * v.size(0) if i != 0 and i % 100 == 0: print( 'training: %d/%d, train_loss: %.6f, train_acc: %.6f' % (i, len(train_loader), total_loss / (i * v.size(0)), 100 * train_score / (i * v.size(0)))) total_loss /= N if None != eval_loader: model.train(False) eval_score, bound = evaluate(model, eval_loader, opt) model.train(True) logger.write('\nlr: %.7f' % optim.param_groups[0]['lr']) logger.write('epoch %d, time: %.2f' % (epoch, time.time() - t)) logger.write( '\ttrain_loss: %.2f, norm: %.4f, score: %.2f' % (total_loss, total_norm / count_norm, train_score)) if eval_loader is not None: logger.write('\teval score: %.2f (%.2f)' % (100 * eval_score, 100 * bound)) if (eval_loader is not None and eval_score > best_eval_score): if opt.lp == 0: model_path = os.path.join(opt.output, 'SAR_top'+str(opt.train_candi_ans_num)+'_best_model.pth') elif opt.lp == 1: model_path = os.path.join(opt.output, 'SAR_SSL_top'+str(opt.train_candi_ans_num)+'_best_model.pth') elif opt.lp == 2: model_path = os.path.join(opt.output, 'SAR_LMH_top'+str(opt.train_candi_ans_num)+'_best_model.pth') utils.save_model(model_path, model, epoch, optim) if eval_loader is not None: best_eval_score = eval_score
    # (continuation of the preceding VQAFeatureDataset('val', dictionary, ...) call)
        args.relation_type,
        adaptive=args.adaptive,
        pos_emb_dim=args.imp_pos_emb_dim,
        dataroot=args.data_folder)
    train_dset = VQAFeatureDataset('train', dictionary, args.relation_type,
                                   adaptive=args.adaptive,
                                   pos_emb_dim=args.imp_pos_emb_dim,
                                   dataroot=args.data_folder)

    # 5. Initialize ReGAT_all
    print("[LOG] 5. Initializing ReGAT_all...")
    model = build_regat_all(val_dset, args).to(device)
    logger = utils.Logger(os.path.join(args.output, 'model_all_log.txt'))
    utils.print_model(model, logger)

    # 6. tfidf
    # Takes around 4 minutes
    print("[LOG] 6. tfidf_from_questions...")
    tfidf = None
    weights = None
    if args.tfidf:
        tfidf, weights = tfidf_from_questions(['train', 'val', 'test2015'], dictionary)

    # 7. Initialize word embeddings
    print("[LOG] 7. Initializing word embeddings...")
    model.w_emb.init_embedding(
        join(args.data_folder, 'glove/glove6b_init_300d.npy'), tfidf, weights)
def train(model, train_loader, eval_loader, num_epochs, output, opt=None, s_epoch=0): lr_default = 1e-3 if eval_loader is not None else 7e-4 lr_decay_step = 2 lr_decay_rate = 0.25 lr_decay_epochs = range( 10, 20, lr_decay_step) if eval_loader is not None else range( 10, 20, lr_decay_step) gradual_warmup_steps = [ 0.5 * lr_default, 1.0 * lr_default, 1.5 * lr_default, 2.0 * lr_default ] saving_epoch = 3 grad_clip = .25 utils.create_dir(output) optim = torch.optim.Adamax(filter(lambda p: p.requires_grad, model.parameters()), lr=lr_default) \ if opt is None else opt logger = utils.Logger(os.path.join(output, 'log.txt')) best_eval_score = 0 utils.print_model(model, logger) logger.write('optim: adamax lr=%.4f, decay_step=%d, decay_rate=%.2f, grad_clip=%.2f' % \ (lr_default, lr_decay_step, lr_decay_rate, grad_clip)) import pickle as pkl lab2ans = pkl.load(open("./data/cache/trainval_label2ans.pkl", 'rb')) woman_answer_words = [ 'woman', 'women', 'female', 'girl', 'lady', 'she', 'her', 'hers', 'ladies', 'girls' ] man_answer_words = [ 'man', 'men', 'male', 'boy', 'he', 'his', 'gentleman', 'gentlemen', 'boys' ] for epoch in range(s_epoch, num_epochs): total_loss = 0 train_score = 0 train_score_vqa = 0 total_norm = 0 count_norm = 0 total_fair_loss = 0 total_dis_loss = 0 woman = 0 woman_true = 0 man = 0 woman_man = 0 man_woman = 0 man_true = 0 other = 0.0001 other_o = 0 t = time.time() N = len(train_loader.dataset) print(N) if epoch < len(gradual_warmup_steps): optim.param_groups[0]['lr'] = gradual_warmup_steps[epoch] logger.write('gradual warmup lr: %.4f' % optim.param_groups[0]['lr']) elif epoch in lr_decay_epochs: optim.param_groups[0][ 'lr'] = optim.param_groups[0]['lr'] * lr_decay_rate logger.write('decreased lr: %.4f' % optim.param_groups[0]['lr']) else: logger.write('lr: %.4f' % optim.param_groups[0]['lr']) for i, (v, b, q, a, ques, im, g, gender) in enumerate(train_loader): v = v.cuda() b = b.cuda() q = q.cuda() a = a.cuda() visual_pred, vqa_pred, att = model(v, b, q, a) #import pdb;pdb.set_trace() gender = gender.squeeze(1) weights = torch.Tensor([2.0, 1.0, 0.001]).cuda() vqa_loss = instance_bce_with_logits(vqa_pred, a) loss = 0 #loss=nn.CrossEntropyLoss(weights) #loss=loss(visual_pred,gender.cuda()) #dis_loss=torch.abs(visual_pred[:,0]-visual_pred[:,1]).mean() #dis_loss=dis_loss.cuda() if epoch < 30: t_loss = vqa_loss else: t_loss = loss + vqa_loss t_loss.backward() #import pdb;pdb.set_trace() #vp=visual_pred[:,:2].cuda() #g=g[:,:2] #crossloss=instance_bce_with_logits(vp,g.cuda()) #mseloss=torch.nn.functional.mse_loss(vp.softmax(1),g.cuda()) #g_swap=g[:,[1,0]].cuda() #swap_loss=(vp.softmax(1)*g_swap).sum(1) #swap_loss=swap_loss.sum() #import pdb;pdb.set_trace() for j in range(len(v)): if gender[j] == 0: woman = woman + 1 check = 0 for woman_answer in woman_answer_words: if lab2ans[int(vqa_pred[j].argmax())] == woman_answer: check = 1 if check == 1: woman_true = woman_true + 1 check = 0 for man_answer in man_answer_words: if lab2ans[int(vqa_pred[j].argmax())] == man_answer: check = 1 if check == 1: woman_man = woman_man + 1 check = 0 if gender[j] == 1: man = man + 1 check = 0 for man_answer in man_answer_words: if lab2ans[int(vqa_pred[j].argmax())] == man_answer: check = 1 if check == 1: man_true = man_true + 1 check = 0 for woman_answer in woman_answer_words: if lab2ans[int(vqa_pred[j].argmax())] == woman_answer: check = 1 if check == 1: man_woman = man_woman + 1 check = 0 total_norm += nn.utils.clip_grad_norm_(model.parameters(), grad_clip) count_norm += 1 optim.step() optim.zero_grad() 
#total_fair_loss+=soft_fair_loss #total_dis_loss+=dis_loss #batch_score=torch.eq(visual_pred.argmax(1),gender.cuda()).sum() batch_score_vqa = compute_score_with_logits(vqa_pred, a.data).sum() #batch_score = compute_score_with_logits(visual_pred, g.cuda()).sum() #total_loss += loss.item() * v.size(0) #train_score += batch_score.item() train_score_vqa += batch_score_vqa.item() #train_score+=batch_score if i == 50 or i == 100 or i == 500: print(loss) #print(10*soft_fair_loss) print("\n\n") total_loss /= N train_score = 100 * train_score / N train_score_vqa = 100 * train_score_vqa / N #import pdb;pdb.set_trace() print("epoch", epoch) woman_score = float(woman_true) / woman man_score = float(man_true) / man #other_score=float(other_o)/other print("woman", woman) print("man", man) print("other", other) print("train_woman_score", woman_score * 100) print("train_man_score", man_score * 100) #print("train_other_score",other_score*100) print("vqa", train_score_vqa) print("\n\n") if None != eval_loader: model.train(False) eval_score, bound, _ = evaluate(model, eval_loader) model.train(True) #print("total_fair_loss",total_fair_loss) #print("totla_dis_loss",total_dis_loss) logger.write('epoch %d, time: %.2f' % (epoch, time.time() - t)) logger.write('\ttrain_loss: %.2f, norm: %.4f, score: %.2f' % (total_loss, total_norm / count_norm, train_score)) #logger.write('\total_fair_loss: %.2f, norm: %.4f, score: %.2f' % (total_loss, total_norm/count_norm, total_fair_loss)) logger.write('\teval score: %.2f (%.2f)' % (100 * eval_score, 100 * bound)) model_path = os.path.join(output, 'model_epoch%d.pth' % epoch) utils.save_model(model_path, model, epoch, optim)
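The train() loop above (and the variants that follow) call compute_score_with_logits and instance_bce_with_logits without defining them. The block below is only a minimal sketch of how these helpers are commonly written in VQA training code, assuming soft answer scores as targets; the actual definitions used by the original repository may differ.

import torch
import torch.nn.functional as F

def instance_bce_with_logits(logits, labels, reduction='mean'):
    # multi-label BCE over soft answer scores, scaled by the number of answer classes
    loss = F.binary_cross_entropy_with_logits(logits, labels, reduction=reduction)
    if reduction == 'mean':
        loss = loss * labels.size(1)
    return loss

def compute_score_with_logits(logits, labels):
    # soft VQA accuracy: each example gets the ground-truth score of its argmax answer
    pred = torch.max(logits, 1)[1]              # predicted answer index per example
    one_hots = torch.zeros_like(labels)
    one_hots.scatter_(1, pred.view(-1, 1), 1)
    return one_hots * labels                    # callers .sum() this to get the batch score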
def train(model, train_loader, eval_loader, opt): utils.create_dir(opt.output) optim = torch.optim.Adam(model.parameters(), lr=opt.learning_rate, betas=(0.9, 0.999), eps=1e-08, weight_decay=opt.weight_decay) logger = utils.Logger(os.path.join(opt.output, 'log.txt')) utils.print_model(model, logger) # load snapshot if opt.checkpoint_path is not None: print('loading %s' % opt.checkpoint_path) model_data = torch.load(opt.checkpoint_path) model.load_state_dict(model_data.get('model_state', model_data)) optim.load_state_dict(model_data.get('optimizer_state', model_data)) opt.s_epoch = model_data['epoch'] + 1 for param_group in optim.param_groups: param_group['lr'] = opt.learning_rate scheduler = MultiStepLR(optim, milestones=[10, 15, 20, 25, 30, 35], gamma=0.5) scheduler.last_epoch = opt.s_epoch best_eval_score = 0 for epoch in range(opt.s_epoch, opt.num_epochs): total_loss = 0 total_bce_loss = 0 self_loss = 0 total_self_loss = 0 train_score_pos = 0 train_score_neg = 0 total_norm = 0 count_norm = 0 t = time.time() N = len(train_loader.dataset) scheduler.step() for i, (v, b, q, a, _) in enumerate(train_loader): v = v.cuda() q = q.cuda() a = a.cuda() # for the labeled samples if epoch < opt.pretrain_epoches: logits_pos, _ = model(q, v, False) if opt.ml_loss: bce_loss_pos = instance_bce_with_logits(logits_pos, a, reduction='mean') else: bce_loss_pos = instance_bce(logits_pos, a) loss = bce_loss_pos else: logits_pos, logits_neg, _, _ = model(q, v, True) if opt.ml_loss: #use multi-label loss bce_loss_pos = instance_bce_with_logits(logits_pos, a, reduction='mean') else: #use cross-entropy loss bce_loss_pos = instance_bce(logits_pos, a) self_loss = compute_self_loss(logits_neg, a) loss = bce_loss_pos + opt.self_loss_weight * self_loss loss.backward() total_norm += nn.utils.clip_grad_norm_(model.parameters(), opt.grad_clip) count_norm += 1 optim.step() optim.zero_grad() score_pos = compute_score_with_logits(logits_pos, a.data).sum() train_score_pos += score_pos.item() total_loss += loss.item() * v.size(0) total_bce_loss += bce_loss_pos.item() * v.size(0) if epoch < opt.pretrain_epoches: #pretrain total_self_loss = 0 train_score_neg = 0 else: #finetune score_neg = compute_score_with_logits(logits_neg, a.data).sum() total_self_loss += self_loss.item() * v.size(0) train_score_neg += score_neg.item() if i != 0 and i % 100 == 0: print( 'training: %d/%d, train_loss: %.6f, bce_loss: %.6f, self_loss: %.6f, neg_train_acc: %.6f, pos_train_acc: %.6f' % (i, len(train_loader), total_loss / (i * v.size(0)), total_bce_loss / (i * v.size(0)), total_self_loss / (i * v.size(0)), 100 * train_score_neg / (i * v.size(0)), 100 * train_score_pos / (i * v.size(0)))) total_loss /= N total_bce_loss /= N total_self_loss /= N train_score_pos = 100 * train_score_pos / N if None != eval_loader: model.train(False) eval_score, bound, entropy = evaluate(model, eval_loader) model.train(True) logger.write('\nlr: %.7f' % optim.param_groups[0]['lr']) logger.write('epoch %d, time: %.2f' % (epoch, time.time() - t)) logger.write('\ttrain_loss: %.2f, norm: %.4f, score: %.2f' % (total_loss, total_norm / count_norm, train_score_pos)) if eval_loader is not None: logger.write('\teval score: %.2f (%.2f)' % (100 * eval_score, 100 * bound)) if eval_loader is not None and entropy is not None: info = '' + ' %.2f' % entropy logger.write('\tentropy: ' + info) if (eval_loader is not None and eval_score > best_eval_score): model_path = os.path.join(opt.output, 'best_model.pth') utils.save_model(model_path, model, epoch, optim) if eval_loader is not None:
best_eval_score = eval_score
def train(args, model, train_loader, eval_loader, num_epochs, output, opt=None, s_epoch=0): device = args.device # Scheduler learning rate lr_default = args.lr lr_decay_step = 2 lr_decay_rate = 0.75 lr_decay_epochs = (range(10, 20, lr_decay_step) if eval_loader is not None else range(10, 20, lr_decay_step)) gradual_warmup_steps = [ 0.5 * lr_default, 1.0 * lr_default, 1.5 * lr_default, 2.0 * lr_default, ] saving_epoch = 15 # Start point for model saving grad_clip = args.clip_norm utils.create_dir(output) # Adamax optimizer optim = (torch.optim.Adamax(filter(lambda p: p.requires_grad, model.parameters()), lr=lr_default) if opt is None else opt) # Loss function criterion = torch.nn.BCEWithLogitsLoss(reduction="sum") ae_criterion = torch.nn.MSELoss() # write hyper-parameter to log file logger = utils.Logger(os.path.join(output, "log.txt")) logger.write(args.__repr__()) utils.print_model(model, logger) logger.write( "optim: adamax lr=%.4f, decay_step=%d, decay_rate=%.2f, grad_clip=%.2f" % (lr_default, lr_decay_step, lr_decay_rate, grad_clip)) # create trainer trainer = Trainer(args, model, criterion, optim, ae_criterion) update_freq = int(args.update_freq) wall_time_start = time.time() best_eval_score = 0 # Epoch passing in training phase for epoch in range(s_epoch, num_epochs): total_loss = 0 train_score = 0 total_norm = 0 count_norm = 0 num_updates = 0 t = time.time() N = len(train_loader.dataset) num_batches = int(N / args.batch_size + 1) if epoch < len(gradual_warmup_steps): trainer.optimizer.param_groups[0]["lr"] = gradual_warmup_steps[ epoch] logger.write("gradual warm up lr: %.4f" % trainer.optimizer.param_groups[0]["lr"]) elif epoch in lr_decay_epochs: trainer.optimizer.param_groups[0]["lr"] *= lr_decay_rate logger.write("decreased lr: %.4f" % trainer.optimizer.param_groups[0]["lr"]) else: logger.write("lr: %.4f" % trainer.optimizer.param_groups[0]["lr"]) # Predicting and computing score for i, (v, q, a, _, _, _) in enumerate(train_loader): if args.maml: v[0] = v[0].reshape(v[0].shape[0], 84, 84).unsqueeze(1) if args.autoencoder: v[1] = v[1].reshape(v[1].shape[0], 128, 128).unsqueeze(1) v[0] = v[0].to(device) v[1] = v[1].to(device) q = q.to(device) a = a.to(device) sample = [v, q, a] if i < num_batches - 1 and (i + 1) % update_freq > 0: trainer.train_step(sample, update_params=False) else: loss, grad_norm, batch_score = trainer.train_step( sample, update_params=True) total_norm += grad_norm count_norm += 1 total_loss += loss.item() train_score += batch_score num_updates += 1 if num_updates % int(args.print_interval / update_freq) == 0: print( "Iter: {}, Loss {:.4f}, Norm: {:.4f}, Total norm: {:.4f}, Num updates: {}, Wall time: {:.2f}, ETA: {}" .format( i + 1, total_loss / ((num_updates + 1)), grad_norm, total_norm, num_updates, time.time() - wall_time_start, utils.time_since(t, i / num_batches), )) total_loss /= num_updates train_score = 100 * train_score / (num_updates * args.batch_size) # Evaluation if eval_loader is not None: print("Evaluating...") trainer.model.train(False) eval_score, bound = evaluate(model, eval_loader, args) trainer.model.train(True) logger.write("epoch %d, time: %.2f" % (epoch, time.time() - t)) logger.write("\ttrain_loss: %.2f, norm: %.4f, score: %.2f" % (total_loss, total_norm / count_norm, train_score)) if eval_loader is not None: logger.write("\teval score: %.2f (%.2f)" % (100 * eval_score, 100 * bound)) # Save per epoch if epoch >= saving_epoch: model_path = os.path.join(output, "model_epoch%d.pth" % epoch) utils.save_model(model_path, model, epoch, 
trainer.optimizer) # Save best epoch if eval_loader is not None and eval_score > best_eval_score: model_path = os.path.join(output, "model_epoch_best.pth") utils.save_model(model_path, model, epoch, trainer.optimizer) best_eval_score = eval_score
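The three train() variants above share the same learning-rate policy: a gradual warmup over the first few epochs followed by step decay. Below is a self-contained sketch of that schedule, using the defaults from the first variant (base lr 1e-3, decay step 2, decay rate 0.25) purely for illustration:

lr_default = 1e-3
gradual_warmup_steps = [0.5 * lr_default, 1.0 * lr_default,
                        1.5 * lr_default, 2.0 * lr_default]
lr_decay_epochs = range(10, 20, 2)
lr_decay_rate = 0.25

lr = lr_default
for epoch in range(20):
    if epoch < len(gradual_warmup_steps):
        lr = gradual_warmup_steps[epoch]   # warmup: 0.5x, 1.0x, 1.5x, 2.0x of the base lr
    elif epoch in lr_decay_epochs:
        lr *= lr_decay_rate                # step decay at epochs 10, 12, 14, 16, 18
    print('epoch %d lr %.6f' % (epoch, lr))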
def main(): parser = argparse.ArgumentParser( description='PyTorch MNIST Example', formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--dataset', type=str, default='data/mnist.npy', help='path to dataset') parser.add_argument('--batch-size', type=int, default=100, metavar='N', help='input batch size for training') parser.add_argument('--epochs', type=int, default=101, metavar='N', help='number of epochs to train') parser.add_argument('--LR', type=float, default=0.01, metavar='LR', help='learning rate') parser.add_argument('--L2', type=float, default=0.0001, metavar='L2', help='L2 weight decay strength') parser.add_argument('--L1_1', type=float, default=5e-4, metavar='L2', help='L1 weight decay strength') parser.add_argument('--L1_2', type=float, default=1e-5, metavar='L2', help='L1 weight decay strength') parser.add_argument('--L3', type=float, default=0.05, metavar='L3', help='gradient decay strength') parser.add_argument('--momentum', type=float, default=0.9, metavar='M', help='SGD momentum') parser.add_argument('--seed', type=int, default=1, metavar='S', help='random seed') parser.add_argument('--use_bias', dest='use_bias', action='store_true', help='use biases') parser.add_argument('--q_a', type=int, default=4, metavar='S', help='quantize activations to this number of bits') parser.add_argument('--act_max', type=float, default=1.0, help='clipping threshold for activations') parser.add_argument('--w_max', type=float, default=0., help='clipping threshold for weights') parser.add_argument('--stochastic', type=float, default=0.5, help='stochastic quantization') parser.add_argument('--debug', dest='debug', action='store_true', help='debug') parser.add_argument('--calculate_running', dest='calculate_running', action='store_true', help='calculate_running') parser.add_argument('--plot', dest='plot', action='store_true', help='plot') parser.add_argument('--save', dest='save', action='store_true', help='save') parser.add_argument('--bn1', dest='bn1', action='store_true', help='bn1') parser.add_argument('--bn2', dest='bn2', action='store_true', help='bn2') parser.add_argument('--track_running_stats', dest='track_running_stats', action='store_true', help='track_running_stats') parser.add_argument('--augment', dest='augment', action='store_true', help='augment') parser.add_argument('--triple_input', dest='triple_input', action='store_true', help='triple_input') parser.add_argument('--dropout_input', type=float, default=0.2, help='dropout_input drop prob') parser.add_argument('--dropout_act', type=float, default=0.4, help='dropout_act drop prob') parser.add_argument('--prune_weights1', type=float, default=0.0, help='percentage of smallest weights to set to zero') parser.add_argument('--prune_weights2', type=float, default=0.0, help='percentage of smallest weights to set to zero') parser.add_argument('--prune_epoch', type=float, default=90, help='do pruning at the end of this epoch') parser.add_argument('--var_name', type=str, default='', help='var_name') parser.add_argument('--gpu', type=str, default=None, help='gpu') parser.add_argument('--num_sims', type=int, default=1, help='number of simulation runs') args = parser.parse_args() if args.gpu is not None: os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu np.set_printoptions(precision=4, linewidth=200, suppress=True) data = np.load(args.dataset, allow_pickle=True) train_data, val_data = data train_inputs, train_labels = train_data test_inputs, test_labels = val_data train_inputs = torch.from_numpy(train_inputs).cuda() 
train_labels = torch.from_numpy(train_labels).cuda() test_inputs = torch.from_numpy(test_inputs).cuda() test_labels = torch.from_numpy(test_labels).cuda() results = {} if args.var_name == 'L1_1': var_list = [ 0, 1e-6, 2e-6, 3e-6, 5e-6, 7e-6, 1e-5, 2e-5, 3e-5, 4e-5, 5e-5, 7e-5, 1e-4, 2e-4 ] elif args.var_name == 'L1_2': var_list = [ 0, 1e-6, 2e-6, 3e-6, 5e-6, 7e-6, 1e-5, 2e-5, 3e-5, 4e-5, 5e-5, 7e-5, 1e-4, 2e-4 ] elif args.var_name == 'L3': var_list = [ 0, 0.001, 0.002, 0.003, 0.005, 0.007, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.08, 0.1, 0.2 ] elif args.var_name == 'L2': var_list = [ 0, 5e-6, 1e-5, 2e-5, 3e-5, 4e-5, 5e-5, 7e-5, 1e-4, 2e-4, 3e-4, 4e-4, 5e-4, 0.001 ] else: var_list = [' '] total_list = [] for var in var_list: if args.var_name != '': print('\n\n********** Setting {} to {} **********\n\n'.format( args.var_name, var)) setattr(args, args.var_name, var) results[var] = [] best_accs = [] for s in range(args.num_sims): model = Net(args).cuda() optimizer = optim.SGD(model.parameters(), lr=args.LR, momentum=args.momentum, weight_decay=args.L2) num_train_batches = int(len(train_inputs) / args.batch_size) best_acc = 0 if s == 0: utils.print_model(model, args) for epoch in range(args.epochs): rnd_idx = np.random.permutation(len(train_inputs)) train_inputs = train_inputs[rnd_idx] train_labels = train_labels[rnd_idx] if epoch % 70 == 0 and epoch != 0: print('\nReducing learning rate ') for param_group in optimizer.param_groups: param_group['lr'] = param_group['lr'] / 10. train_acc = train(args, model, num_train_batches, train_inputs, train_labels, optimizer) val_acc = test(model, test_inputs, test_labels) if (args.prune_weights1 > 0 or args.prune_weights2 > 0 ) and epoch % args.prune_epoch == 0 and epoch != 0: print('\n\nAccuracy before pruning: {:.2f}\n\n'.format( val_acc)) sparsities = prune_weights(args, model) val_acc = test(model, test_inputs, test_labels) print('\n\nAccuracy after pruning: {:.2f}\n\n'.format( val_acc)) else: sparsities = [ p.data[torch.abs(p.data) < 0.01 * p.data.max()].numel() / p.data.numel() * 100.0 for _, p in model.named_parameters() ] print( 'Epoch {:>2d} train acc {:>.2f} test acc {:>.2f} LR {:.4f} sparsity {:>3.1f} {:>3.1f}' .format(epoch, train_acc, val_acc, optimizer.param_groups[0]['lr'], sparsities[0], sparsities[1])) if val_acc > best_acc: best_acc = val_acc if epoch > 80 and (args.save or args.plot): sparsities = prune_weights(args, model) val_acc = test(model, test_inputs, test_labels) print('\n\nAccuracy after pruning: {:.2f}\n\n'.format( val_acc)) w_pos = model.fc1.weight.clone() w_pos[w_pos < 0] = 0 w_neg = model.fc1.weight.clone() w_neg[w_neg >= 0] = 0 pos = F.linear(model.quantized_input, w_pos) neg = F.linear(model.quantized_input, w_neg) sep1 = torch.cat((neg, pos), 0) w_pos = model.fc2.weight.clone() w_pos[w_pos < 0] = 0 w_neg = model.fc2.weight.clone() w_neg[w_neg >= 0] = 0 pos = F.linear(model.act, w_pos) neg = F.linear(model.act, w_neg) sep2 = torch.cat((neg, pos), 0) dict_names = [ 'input', 'fc1_weights', 'preact', 'diff_preact', 'act', 'fc2_weights', 'output', 'diff_output' ] tensors = [ model.quantized_input, model.fc1.weight, model.preact, sep1, model.act, model.fc2.weight, model.output, sep2 ] shapes = [list(t.shape) for t in tensors] arrays = [ t.detach().cpu().half().numpy() for t in tensors ] mlp_dict = { key: value for key, value in zip(dict_names, shapes) } if args.save: print('\n\nSaving MLP:\n{}\n'.format(mlp_dict)) # np.save('mlp.npy', arrays[1:]) # scipy.io.savemat('chip_plots/mnist_val.mat', mdict={key: value for key, value in 
zip(names[:], values[:])}) # scipy.io.savemat('chip_plots/mnist_labels.mat', mdict={'mnist_test_labels': test_labels.detach().cpu().numpy()}) # print('\nLabels:', test_labels.detach().cpu().numpy().shape, test_labels.detach().cpu().numpy()[:20], '\n\n') scipy.io.savemat( 'chip_plots/mlp.mat', mdict={ key: value for key, value in zip( dict_names[1:], arrays[1:]) }) # scipy.io.savemat('chip_plots/mlp_first_layer_q4_act_1_acc_.mat', mdict={dict_names[2]: arrays[2], dict_names[3]: arrays[3]}) if args.plot: names = [ 'input', 'weights', 'output', 'diff_output' ] layers = [] layer = [] print('\n\nlen(arrays) // len(names):', len(arrays), len(names), len(arrays) // len(names), '\n\n') num_layers = len(arrays) // len(names) for k in range(num_layers): print('layer', k, names) for j in range(len(names)): layer.append([arrays[len(names) * k + j]]) layers.append(layer) layer = [] info = [] neuron_inputs = [] for n, p in model.named_parameters(): if 'weight' in n: neuron_inputs.append(np.prod(p.shape[1:])) for idx in range(len(neuron_inputs)): temp = [] temp.append('{:d} neuron inputs '.format( neuron_inputs[idx])) #if args.plot_power: #temp.append('{:.2f}mW '.format(self.power[idx][0])) info.append(temp) if args.plot: print('\nPlotting {}\n'.format(names)) plot_layers(num_layers=len(layers), models=['chip_plots/'], epoch=epoch, i=0, layers=layers, names=names, var='', vars=[''], infos=info, pctl=99.9, acc=val_acc) #plot_grid([[[v] for v in values]], ['input', 'quantized_input', 'weights', 'output'], path='chip_plots/epoch_' + str(epoch), filename='_mlp_histograms.png') #layers = [[[a1, aa1], [a2, aa2]]] #raise(SystemExit) if args.plot and os.path.exists('chip_plots/mlp.mat'): os.rename( r'chip_plots/mlp.mat', r'chip_plots/mlp_act_max_{:.1f}_w_max_{:.1f}_L2_{:.4f}_L3_{:.1f}_drop_{:.2f}_{:.2f}_LR_{:.3f}_acc_{:.2f}.mat' .format(args.act_max, args.w_max, args.L2, args.L3, args.dropout_input, args.dropout_act, args.LR, best_acc)) print('\nSimulation {:d} Best Accuracy: {:.2f}\n\n'.format( s, best_acc)) best_accs.append(best_acc) total_list.append( (np.mean(best_accs), np.min(best_accs), np.max(best_accs))) print('\n{:d} runs: {} {} {:.2f} ({:.2f}/{:.2f})\n'.format( args.num_sims, args.var_name, var, *total_list[-1])) print('\n\n') for var, (mean, min, max) in zip(var_list, total_list): print('{} {:>5} acc {:.2f} ({:.2f}/{:.2f})'.format( args.var_name, var, mean, min, max)) print('\n\n')
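prune_weights(args, model) is called in main() above, but its body is not part of this snippet. The sketch below assumes a standard magnitude-based pruning step (zero the smallest fraction of each weight tensor and report per-tensor sparsity); the real helper, driven by prune_weights1/prune_weights2, may behave differently.

import torch

def prune_smallest_weights(model, fraction=0.1):
    # zero the `fraction` smallest-magnitude entries of every weight tensor
    sparsities = []
    for name, p in model.named_parameters():
        if 'weight' not in name:
            continue
        k = int(fraction * p.numel())
        if k > 0:
            threshold = p.detach().abs().flatten().kthvalue(k).values
            p.data[p.data.abs() <= threshold] = 0.0
        sparsities.append(100.0 * (p.data == 0).sum().item() / p.numel())
    return sparsities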
def main(_): ps_hosts = FLAGS.ps_hosts.split(",") worker_hosts = FLAGS.worker_hosts.split(",") # Create a cluster from the parameter server and worker hosts. cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts}) if FLAGS.job_name == "ps": ps_config = tf.ConfigProto(gpu_options=tf.GPUOptions( per_process_gpu_memory_fraction=0.01)) # Create and start a server for the local task. server = tf.train.Server( cluster, # protocol = "grpc_rdma", job_name=FLAGS.job_name, task_index=FLAGS.task_index, config=ps_config) server.join() elif FLAGS.job_name == "worker": # Create and start a server for the local task. server = tf.train.Server( cluster, # protocol = "grpc+verbs", job_name=FLAGS.job_name, task_index=FLAGS.task_index) ###################### # Select the dataset # ###################### dataset = dataset_factory.get_dataset(FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.data_dir) ##################################### # Select the preprocessing function # ##################################### image_preprocessing_fn = preprocessing_factory.get_preprocessing( FLAGS.network, is_training=True) ###################### # Select the network # ###################### network_fn = nets_factory.get_network_fn(FLAGS.network, FLAGS.num_classes, is_training=True) if FLAGS.dataset_name != "synthetic": provider = slim.dataset_data_provider.DatasetDataProvider( dataset, num_readers=FLAGS.num_readers, common_queue_capacity=20 * FLAGS.batch_size, common_queue_min=10 * FLAGS.batch_size) [image, label] = provider.get(['image', 'label']) image = image_preprocessing_fn(image, network_fn.default_image_size, network_fn.default_image_size) images, labels = tf.train.batch([image, label], batch_size=FLAGS.batch_size, num_threads=4, capacity=5 * FLAGS.batch_size) else: images = random_ops.random_uniform( (FLAGS.batch_size, network_fn.default_image_size, network_fn.default_image_size, 3), maxval=1) labels = random_ops.random_uniform((FLAGS.batch_size, ), maxval=FLAGS.num_classes - 1, dtype=tf.int32) with tf.device( tf.train.replica_device_setter( ps_device='/job:ps/cpu:0', worker_device=("/job:worker/task:%d" % FLAGS.task_index), cluster=cluster)): global_step = tf.contrib.framework.get_or_create_global_step() #images, labels = cifar.distorted_inputs(FLAGS) logits, end_points = network_fn(images) loss = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=logits, labels=labels) cost = tf.reduce_mean(loss) train_op = tf.train.AdagradOptimizer(0.01).minimize( cost, global_step=global_step) print_model() train_dir = tempfile.mkdtemp() sess_config = tf.ConfigProto( allow_soft_placement=True, log_device_placement=False, device_filters=[ "/job:ps", "/job:worker/task:%d" % FLAGS.task_index ], graph_options=tf.GraphOptions( optimizer_options=tf.OptimizerOptions( opt_level=tf.OptimizerOptions.L1)), gpu_options=tf.GPUOptions(visible_device_list="")) if FLAGS.infer_shapes == True: sess_config.graph_options.infer_shapes = FLAGS.infer_shapes sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0), logdir=train_dir, init_op=tf.global_variables_initializer(), global_step=global_step, summary_writer=None, saver=None) if FLAGS.task_index == 0: print("Worker %d: Initializing session..." % FLAGS.task_index) else: print("Worker %d: Waiting for session to be initialized..." % FLAGS.task_index) sess = sv.prepare_or_wait_for_session(server.target, config=sess_config, start_standard_services=True) print("Start warmup %d epoch." 
% FLAGS.warmup) for _ in range(FLAGS.warmup): sess.run(train_op) options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() current_step = 0 duration = 0 while current_step < FLAGS.epoch: current_step += 1 start_time = time.time() _, step_loss = sess.run([train_op, cost], options=options, run_metadata=run_metadata) end_time = time.time() print( "Finish step %d, loss = %f, speed = %f samples/s, duration = %f seconds" % (current_step, step_loss, FLAGS.batch_size / (end_time - start_time), end_time - start_time)) duration += end_time - start_time if current_step == 3: fetched_timeline = timeline.Timeline(run_metadata.step_stats) chrome_trace = fetched_timeline.generate_chrome_trace_format() with open('timeline.json', 'w') as f: f.write(chrome_trace) print("Total Time = %f s." % duration) #writer.close() else: sys.exit("Invalid job role name [%s]!" % FLAGS.job_name)
def main(_): ps_hosts = FLAGS.ps_hosts.split(",") worker_hosts = FLAGS.worker_hosts.split(",") # Create a cluster from the parameter server and worker hosts. cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts}) if FLAGS.job_name == "ps": ps_config = tf.ConfigProto( gpu_options=tf.GPUOptions( per_process_gpu_memory_fraction=0.00001 )) # Create and start a server for the local task. server = tf.train.Server(cluster, # protocol = "grpc_rdma", job_name=FLAGS.job_name, task_index=FLAGS.task_index, config = ps_config) server.join() elif FLAGS.job_name == "worker": maybe_download_and_extract(FLAGS.data_dir, FLAGS.data_url) cifar.modify_flags(FLAGS) print (FLAGS.data_dir) # Create and start a server for the local task. server = tf.train.Server(cluster, # protocol = "grpc_rdma", job_name=FLAGS.job_name, task_index=FLAGS.task_index) local_worker_device = "/job:worker/task:%d" % FLAGS.task_index with tf.device(tf.train.replica_device_setter( ps_device='/job:ps/cpu:0', worker_device=local_worker_device, cluster=cluster)): if FLAGS.network == 'fc': from models.fullyconnect import KitModel elif FLAGS.network == 'cifar': from models.cifar import KitModel elif FLAGS.network == 'alexnet': from models.alexnet import KitModel elif FLAGS.network == 'vgg19' or FLAGS.network == 'vgg_e': from models.vgg19 import KitModel elif FLAGS.network == 'inception_v3' : from models.inception_v3 import KitModel elif FLAGS.network == 'resnet': from models.resnet import KitModel else: sys.exit("Invalid network [%s]" % FLAGS.network) this_model = KitModel(FLAGS) images, labels = cifar.distorted_inputs(FLAGS) logits = this_model.inference(images) loss = this_model.loss(labels) train_op = this_model.train() train_dir = tempfile.mkdtemp() sess_config = tf.ConfigProto( allow_soft_placement=True, log_device_placement=False, device_filters=["/job:ps", "/job:worker/task:%d" % FLAGS.task_index], graph_options=tf.GraphOptions( optimizer_options=tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L1) ), gpu_options=tf.GPUOptions( visible_device_list="" ) ) if FLAGS.infer_shapes == True: sess_config.graph_options.infer_shapes = FLAGS.infer_shapes sv = tf.train.Supervisor( is_chief = (FLAGS.task_index == 0), logdir = train_dir, init_op = tf.global_variables_initializer(), global_step = this_model.global_step, summary_writer = None, saver = None) if FLAGS.task_index == 0: print("Worker %d: Initializing session..." % FLAGS.task_index) else: print("Worker %d: Waiting for session to be initialized..." % FLAGS.task_index) sess = sv.prepare_or_wait_for_session(server.target, config = sess_config, start_standard_services = True) print_model() print ("Start warmup for %d mini-batch." % FLAGS.warmup) for _ in range(FLAGS.warmup): sess.run(this_model.train_op) current_step = 0 current_epoch = 1 duration = 0 FLAGS.epoch = FLAGS.epoch * NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / FLAGS.batch_size print ("Start Training for %d mini-batch." 
% FLAGS.epoch) while current_step < FLAGS.epoch: current_step += 1 start_time = time.time() _, step_loss = sess.run([this_model.train_op, this_model.cost]) end_time = time.time() # print("Finish step %d, loss = %f, speed = %f samples/s, duration = %f seconds" % (current_step, step_loss, FLAGS.batch_size / (end_time - start_time), end_time - start_time)) duration += end_time - start_time print("Time: %f seconds, step_loss: %f" % (duration, step_loss)) if current_step * FLAGS.batch_size > current_epoch * NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN: print ("Finish epoch %d" % (current_epoch)) current_epoch += 1 print ("Total Time = %f s." % duration) #writer.close() else: sys.exit("Invalid job role name [%s]!" % FLAGS.job_name)