def factory(organism):
    from sgd import SGD
    from zfin import ZFin
    from worm import WormBase
    from fly import FlyBase
    from mouse import MGI
    from rat import RGD
    from human import Human

    if organism in ("Saccharomyces cerevisiae", "S. cerevisiae", "YEAST"):
        return SGD()
    elif organism in ("Danio rerio", "D. rerio", "DANRE"):
        return ZFin()
    elif organism in ("Caenorhabditis elegans", "C. elegans", "CAEEL"):
        return WormBase()
    elif organism in ("Drosophila melanogaster", "D. melanogaster", "DROME"):
        return FlyBase()
    elif organism in ("Mus musculus", "M. musculus", "MOUSE"):
        return MGI()
    elif organism in ("Rattus norvegicus", "R. norvegicus", "RAT"):
        return RGD()
    elif organism in ("Homo sapiens", "H. sapiens", "HUMAN"):
        return Human()
    else:
        return None
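# Hedged usage sketch for factory(). The load_genes() call mirrors the MOD loader
# script further down in this section; treat it as an assumption if your MOD
# classes expose a different interface.
if __name__ == "__main__":
    mod = factory("Danio rerio")
    if mod is None:
        raise ValueError("Unsupported organism")
    mod.load_genes()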
def sgd_model(row_vectors, col_vectors, eta, iterations):
    '''Create an SGD model and fit it to the given row and column vectors.'''
    model = SGD(eta, iterations)
    model.learn(row_vectors, col_vectors)
    return model
def main():
    classifier_name = sys.argv[1]
    datapath = sys.argv[2]
    df = pd.read_csv(datapath)
    X = df.iloc[0:100, [0, 2]].values
    y = df.iloc[0:100, 4].values
    y = np.where(y == 'Iris-setosa', 1, -1)

    if classifier_name == 'Perceptron':
        from perceptron import Perceptron
        model = Perceptron(eta=0.01, n_iter=10)
        model.learn(X, y)
        print('errors for this classification are:\n', model.errors)
        plt.plot(range(1, len(model.errors) + 1), model.errors, marker='o')
        model.testdatairis('test.csv')
        print("Accuracy of Perceptron is:", model.accuracy)
        print("--- %s seconds ---" % (time.time() - start_time))
    elif classifier_name == 'Adaline':
        from adaline import Adaline
        model = Adaline(eta=0.01, n_iter=20)
        # Standardise the features before training Adaline
        X_std = np.copy(X)
        X_std[:, 0] = (X[:, 0] - X[:, 0].mean()) / X[:, 0].std()
        X_std[:, 1] = (X[:, 1] - X[:, 1].mean()) / X[:, 1].std()
        model.learn(X_std, y)
        print('sum of errors in each iteration for this classification are:\n', model.cost)
        plt.plot(range(1, len(model.cost) + 1), model.cost, marker='o')
        model.testdatairis('test.csv')
        print("Accuracy of Adaline is:", model.accuracy)
        print("--- %s seconds ---" % (time.time() - start_time))
    elif classifier_name == 'SGD':
        from sgd import SGD
        model = SGD(eta=0.01, n_iter=15)
        model.learn(X, y)
        print('sum of errors in each iteration for this classification are:\n', model.cost)
        plt.plot(range(1, len(model.cost) + 1), model.cost, marker='o')
        model.testdatairis('test.csv')
        print("Accuracy of SGD is:", model.accuracy)
        print("--- %s seconds ---" % (time.time() - start_time))
    else:
        print("invalid classifier")
        return

    plt.title(classifier_name)
    plt.xlabel('iteration')
    plt.ylabel('errors')
    plt.show()
    return model
def get_layer_trainer_sgd_rbm(layer, trainset):
    train_algo = SGD(
        learning_rate=1e-1,
        batch_size=5,
        # "batches_per_iter": 2000,
        monitoring_batches=20,
        monitoring_dataset=trainset,
        cost=SMD(corruptor=GaussianCorruptor(stdev=0.4)),
        termination_criterion=EpochCounter(max_epochs=MAX_EPOCHS),
        # another option:
        # MonitorBasedTermCrit(prop_decrease=0.01, N=10),
    )
    model = layer
    callbacks = [MonitorBasedLRAdjuster(), ModelSaver()]
    return Train(model=model, algorithm=train_algo, callbacks=callbacks, dataset=trainset)
def setUp(self):
    self.batch_size = 10
    wordvec_size = 100
    hidden_size = 100
    self.time_size = 5
    learning_rate = 0.1
    self.max_epoch = 100

    corpus, word_to_id, id_to_word = load_data('train')
    corpus_size = 1000
    corpus = corpus[:corpus_size]
    vocab_size = int(max(corpus) + 1)

    self.xs = corpus[:-1]
    self.ts = corpus[1:]

    model = SimpleRNNLM(vocab_size, wordvec_size, hidden_size)
    optimiser = SGD(learning_rate)
    self.rnnlm_trainer = RNNLMTrainer(model, optimiser)
def get_layer_trainer_sparse_denoising_autoencoder(layer, trainset):
    train_algo = SGD(
        learning_rate=0.1,
        cost=SampledMeanSquaredReconstructionError(),
        batch_size=10,
        monitoring_batches=10,
        monitoring_dataset=trainset,
        termination_criterion=EpochCounter(max_epochs=MAX_EPOCHS),
        update_callbacks=None,
    )
    model = layer
    callbacks = [ModelSaver()]
    return Train(model=model, algorithm=train_algo, callbacks=callbacks, dataset=trainset)
def test_update(self):
    sgd = SGD()
    gnn = GraphNeuralNetwork(vector_size=2)

    # With no gradients set, update() should leave the parameters unchanged.
    expected = gnn.params
    sgd.update(gnn)
    actual = gnn.params
    self.assertEqual(expected, actual)

    params = copy.deepcopy(gnn.params)
    for _ in range(100):
        gnn.grads["W"] = np.random.rand()
        gnn.grads["A"] = np.random.rand()
        gnn.grads["b"] = np.random.rand()
        sgd.update(gnn)
        for key, param in params.items():
            # Each update should move every parameter by lr * grad.
            params[key] = param - gnn.grads[key] * sgd.lr
            expected = repr(params[key])
            actual = repr(gnn.params[key])
            self.assertEqual(expected, actual)
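# Hedged sketch of the optimiser interface this test exercises: update(model)
# subtracts lr * grad from every entry of model.params. The class below is an
# assumption reconstructed from the assertions above, not the project's actual code.
class SGDSketch:
    def __init__(self, lr=0.01):
        self.lr = lr

    def update(self, model):
        # model.params and model.grads are assumed to be dicts keyed by "W", "A", "b", ...
        for key in model.params:
            if key in model.grads:
                model.params[key] = model.params[key] - self.lr * model.grads[key]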
network = models.Sequential()
network.add(layers.Dense(512, activation='relu', input_shape=(28 * 28, )))
network.add(layers.Dense(10, activation='softmax'))

results_acc = []
result_acc = []
results_loss = []
result_loss = []
test_acc_results = []
test_loss_results = []
nonzero_weights = []

l = [
    GRDA(lr=.005, c=.02),
    SGD(lr=.005, nesterov=False),
    SGD(lr=.005, nesterov=True),
    Adagrad(lr=.005),
    Adam(lr=.005, amsgrad=False),
    Adam(lr=.005, amsgrad=True)
]
allcounts = np.sum([x.size for x in network.get_weights()])

for opt in l:
    network.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
    #network.save_weights('initial_weights.h5')
    #network.load_weights('initial_weights.h5')
    #initial_weights = network.get_weights()
    result_acc = []
eval_batch_size = 10
train_data = batchify(corpus.train, args.batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                       args.nlayers, args.dropout, args.tied).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = SGD(model.parameters(), args.lr, momentum=0.9, nesterov=True)
#optimizer = Adagrad(model.parameters(), args.lr)
#optimizer = Adam(model.parameters(), betas=(0.9, 0.999))
#optimizer = RMSprop(model.parameters())

###############################################################################
# Training code
###############################################################################

def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)
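# Hedged usage sketch: repackage_hidden() is typically called once per batch so
# backpropagation through time stops at the batch boundary. The helper below is
# an assumption modelled on the standard PyTorch word_language_model example,
# not code taken from this repository.
def train_one_batch(data, targets, hidden):
    hidden = repackage_hidden(hidden)   # detach hidden state from the previous batch's graph
    optimizer.zero_grad()
    output, hidden = model(data, hidden)
    loss = criterion(output.view(-1, ntokens), targets)
    loss.backward()
    optimizer.step()
    return loss.item(), hidden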
from layers.tanh_layer import TanhLayer

np.random.seed(42)

# Load and clean data
x = None
y = None

# Split x and y into training and validation sets
x_train = None
y_train = None
x_val = None
y_val = None

# Create NN
nn = SGD()
nn.add_layer(InputLayer(10))
nn.add_layer(SigmoidLayer(10))
nn.add_layer(LinearLayer(10))
nn.add_layer(TanhLayer(10))
nn.add_layer(SoftmaxLoglossLayer(10))
nn.initialize_weights()

# Train network
nn.mini_batch_learning(x_train, y_train, x_val, y_val, n_epoch=100,
data = preprocess(args.dataset, makebinary=True)
if data is None:
    print('Dataset unrecognized.')
    exit()

X, y = data['train']
Xs, ys = data['test']

# Based on input, call classifiers
if args.classifier == 'perceptron':
    model = Perceptron(args.eta, args.iters)
elif args.classifier == 'adaline':
    model = Adaline(args.eta, args.iters)
elif args.classifier == 'sgd':
    model = SGD(args.eta, args.iters)
elif args.classifier == 'ovr':
    model = OVR(data['classes'], args.eta, args.iters)

model.fit(X, y)
res = model.predict(Xs)

matches = 0
if args.classifier == 'ovr':
    accuracy = []
    res = res[1]
    clas = res[0]
    print(res)
    print(np.where(ys == clas, 1, -1))
    for i in range(len(res)):
for key, val in tparams.iteritems():
    print key
    # running means and running variances should not be included in the list
    # for which gradients are computed
    if 'rm' in key or 'rv' in key:
        continue
    elif 'sg' in key:
        tparams_sg[key] = val
    else:
        tparams_net[key] = val

print "Setting up optimizers"
f_grad_shared, f_update = adam(lr, tparams_net, grads, inps_net, outs)
# f_grad_shared_sg, f_update_sg = adam(lr, tparams_sg, grads_sg, inps_sg, loss_sg)

# sgd with momentum updates
sgd = SGD(lr=args.learning_rate)
f_update_sg = theano.function(inps_sg, loss_sg,
                              updates=sgd.get_grad_updates(loss_sg, param_sg),
                              on_unused_input='ignore', profile=False)

print "Training"
cost_report = open('./Results/disc/SF/gradcomp_' + code_name + '_' +
                   str(args.batch_size) + '_' + str(args.learning_rate) + '.txt', 'w')

id_order = range(len(trc))

iters = 0
min_cost = 100000.0
epoch = 0
condition = False

while condition == False:
    print "Epoch " + str(epoch + 1),
    np.random.shuffle(id_order)
    epoch_cost = 0.
import sys
sys.path.append('./src')
sys.path.append('./src/concerns')
sys.path.append('./src/models')
sys.path.append('./src/layers')
sys.path.append('./src/optimisers')
from sgd import SGD
from trainer import Trainer
from spiral import *
from two_layer_net import TwoLayerNet

# Hyper parameters
max_epoch = 300
batch_size = 30
hidden_size = 10
learning_rate = 1.0

x, t = load_data()
model = TwoLayerNet(input_size=2, hidden_size=hidden_size, output_size=3)
optimizer = SGD(lr=learning_rate)
trainer = Trainer(model, optimizer)
trainer.fit(x, t, max_epoch, batch_size, eval_interval=10)

file_path = '../img/training_plot.png'
trainer.save_plot_image(file_path)
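# Hedged sketch of the optimiser interface that Trainer is assumed to call: a
# plain SGD with an update(params, grads) method, in the style of the
# "deep learning from scratch" codebases these snippets resemble. The exact
# signature is an assumption, not the repository's confirmed API.
class PlainSGD:
    def __init__(self, lr=0.01):
        self.lr = lr

    def update(self, params, grads):
        # params and grads are assumed to be parallel lists of numpy arrays
        for i in range(len(params)):
            params[i] -= self.lr * grads[i]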
#         out = self.fc1(x)
#         out = F.relu(out)
#         out = self.fc2(out)
#         return out

#net = Net(input_size, hidden_size, num_classes)
from lenet3 import LeNet
net = LeNet()
net.cuda()
net.train()

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
#optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)
optimizer = SGD(net.parameters(), lr=learning_rate, weight_decay=0.0001,
                momentum=0.9, nesterov=True)

# Train the Model
for epoch in range(num_epochs):
    train_loss_log = AverageMeter()
    train_acc_log = AverageMeter()
    val_loss_log = AverageMeter()
    val_acc_log = AverageMeter()
    for i, (images, labels) in enumerate(train_loader):
        # Convert torch tensor to Variable
        # if i > 0:
        #     break
        # images = Variable(images.view(-1, 28*28).cuda())
        images = Variable(images.cuda())
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=4096, metavar='N',
                        help='input batch size for training (default: 4096)')
    parser.add_argument('--optimizer', type=str, default='lamb',
                        choices=['lamb', 'adam', 'lars', 'sgd'],
                        help='which optimizer to use')
    parser.add_argument('--test-batch-size', type=int, default=2048, metavar='N',
                        help='input batch size for testing (default: 2048)')
    parser.add_argument('--epochs', type=int, default=100, metavar='N',
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--lr', type=float, default=0.001, metavar='LR',
                        help='learning rate (default: 0.001)')
    parser.add_argument('--wd', type=float, default=0.01, metavar='WD',
                        help='weight decay (default: 0.01)')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--eta', type=float, default=0.001, metavar='e',
                        help='LARS coefficient (default: 0.001)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    args = parser.parse_args()

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")
    print(device)
    print(args)
    print("*" * 50)

    kwargs = {'num_workers': 4, 'pin_memory': True} if use_cuda else {}
    train_loader = torch.utils.data.DataLoader(
        # datasets.MNIST('../data', train=True, download=True,
        datasets.EMNIST('../data', train=True, download=True, split='letters',
                        transform=transforms.Compose([
                            transforms.ToTensor(),
                            transforms.Normalize((0.1307, ), (0.3081, ))
                        ])),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        # datasets.MNIST('../data', train=False, transform=transforms.Compose([
        datasets.EMNIST('../data', train=False, split='letters',
                        transform=transforms.Compose([
                            transforms.ToTensor(),
                            transforms.Normalize((0.1307, ), (0.3081, ))
                        ])),
        batch_size=args.test_batch_size, shuffle=True, **kwargs)

    num_features = 26
    model = Net(num_outputs=num_features).to(device)
    writer = SummaryWriter(comment="_cv_%s_%s_%s" %
                           (args.optimizer, args.batch_size, args.lr))
    weight_decay = args.lr / args.epochs
    print(len(train_loader), len(test_loader))
    print(model)
    print(f'total params ---> {count_parameters(model)}')

    if args.optimizer == 'lamb':
        optimizer = Lamb(model.parameters(), lr=args.lr, weight_decay=args.wd,
                         betas=(.9, .999), adam=False, writer=writer)
    elif args.optimizer == 'lars':
        base_optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=0.9)
        optimizer = LARS(optimizer=base_optimizer, eps=1e-8, trust_coef=0.001,
                         writer=writer)
    elif args.optimizer == 'sgd':
        optimizer = SGD(model.parameters(), lr=args.lr, momentum=0.9,
                        weight_decay=args.wd, writer=writer)
    else:  # use adam optimizer
        optimizer = Lamb(model.parameters(), lr=args.lr, weight_decay=args.wd,
                         betas=(.9, .999), adam=True, writer=writer)
    print(f'Currently using the {args.optimizer}\n\n')

    metrics = {"acc": [], "test_loss": []}
    os.makedirs("cv_results", exist_ok=True)
    for epoch in range(1, args.epochs + 1):
        print("Epoch #%s" % epoch)
        train(args, model, device, train_loader, optimizer, epoch, writer)
        acc, loss = test(args, model, device, test_loader, writer, epoch)
        metrics["acc"].append(acc)
        metrics["test_loss"].append(loss)
        pickle.dump(
            metrics,
            open(
                os.path.join(
                    "cv_results",
                    '%s_%s_metrics.p' % (args.optimizer, args.batch_size)), 'wb'))
        #print("Epoch #%s: acc=%s, loss=%s" % (epoch, acc, loss))
    return optimizer
def main():
    global best_acc
    start_epoch = args.start_epoch  # start from epoch 0 or last checkpoint epoch

    if not os.path.isdir(args.checkpoint):
        mkdir_p(args.checkpoint)

    # Data
    print('==> Preparing dataset %s' % args.dataset)
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])
    if args.dataset == 'cifar10':
        dataloader = datasets.CIFAR10
        num_classes = 10
    else:
        dataloader = datasets.CIFAR100
        num_classes = 100

    trainset = dataloader(root='../data', train=True, download=True, transform=transform_train)
    trainloader = data.DataLoader(trainset, batch_size=args.train_batch, shuffle=True,
                                  num_workers=args.workers)
    testset = dataloader(root='../data', train=False, download=False, transform=transform_test)
    testloader = data.DataLoader(testset, batch_size=args.test_batch, shuffle=False,
                                 num_workers=args.workers)

    # Model
    print("==> creating model '{}'".format(args.arch))
    if args.arch.startswith('resnext'):
        model = models.__dict__[args.arch](
            cardinality=args.cardinality,
            num_classes=num_classes,
            depth=args.depth,
            widen_factor=args.widen_factor,
            dropRate=args.drop,
        )
    elif args.arch.startswith('densenet'):
        model = models.__dict__[args.arch](
            num_classes=num_classes,
            depth=args.depth,
            growthRate=args.growthRate,
            compressionRate=args.compressionRate,
            dropRate=args.drop,
        )
    elif args.arch.startswith('wrn'):
        model = models.__dict__[args.arch](
            num_classes=num_classes,
            depth=args.depth,
            widen_factor=args.widen_factor,
            dropRate=args.drop,
        )
    elif args.arch.endswith('resnet'):
        model = models.__dict__[args.arch](
            num_classes=num_classes,
            depth=args.depth,
        )
    else:
        model = models.__dict__[args.arch](num_classes=num_classes)

    model = torch.nn.DataParallel(model).cuda()
    cudnn.benchmark = True
    print('    Total params: %.2fM' % (sum(p.numel() for p in model.parameters()) / 1000000.0))
    criterion = nn.CrossEntropyLoss()

    if args.opt_method == "sgd":
        print("using SGD")
        optimizer = SGD(model.parameters(), lr=args.lr, momentum=args.momentum,
                        weight_decay=args.weight_decay, nesterov=False)
    elif args.opt_method == "adam":
        print("using Adam")
        optimizer = optim.Adam(model.parameters(), lr=args.lr,
                               weight_decay=args.weight_decay)
    else:
        raise Exception("Optimizer not supported")

    # Resume
    title = 'cifar-10-' + args.arch
    if args.resume:
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        assert os.path.isfile(args.resume), 'Error: no checkpoint directory found!'
        args.checkpoint = os.path.dirname(args.resume)
        checkpoint = torch.load(args.resume)
        best_acc = checkpoint['best_acc']
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title, resume=True)
    else:
        logger = Logger(os.path.join(
            args.checkpoint,
            str(args.dataset) + '_' + str(args.arch) + '_mom_lr=' + str(args.lr) +
            '_m=' + str(args.momentum) + '_drop:' + str(args.drop) +
            '_seed=' + str(args.manualSeed) + '.txt'), title=title)
        logger.set_names([
            'Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.'
        ])

    if args.evaluate:
        print('\nEvaluation only')
        test_loss, test_acc = test(testloader, model, criterion, start_epoch, use_cuda)
        print(' Test Loss: %.8f, Test Acc: %.2f' % (test_loss, test_acc))
        return

    flag3 = 0
    # Train and val
    for epoch in range(start_epoch, args.epochs):
        if args.opt_method == "sgd":
            adjust_learning_rate(optimizer, epoch)
        print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, state['lr']))

        train_loss, train_acc = train(trainloader, model, criterion, optimizer, epoch, use_cuda)
        test_loss, test_acc = test(testloader, model, criterion, epoch, use_cuda)

        if flag3 == 0:
            logger.append([0, train_init_loss, test_init_loss, 0, 0])
            flag3 = 1
        # append logger file
        logger.append([state['lr'], train_loss, test_loss, train_acc, test_acc])

        # save model
        is_best = test_acc > best_acc
        best_acc = max(test_acc, best_acc)

    logger.close()

    print('Best acc:')
    print(best_acc)
# Load and prepare data
date, latitude, longitude, magnitude = Dataset.load_from_file("database.csv")
data_size = len(date)
vectorsX, vectorsY = Dataset.vectorize(date, latitude, longitude), magnitude.reshape((data_size, 1))

# Get batcher
batch_gen = Generator.gen_random_batch(batch_size, vectorsX, vectorsY)

# Randomly initialize our weights with mean 0
syn0 = 2 * np.random.standard_normal((vectorsX.shape[1], 32)) / 10
syn1 = 2 * np.random.standard_normal((32, vectorsY.shape[1])) / 10

# Init trainer table and datalog
trainers = [SGD(syn0, syn1), Momentum(syn0, syn1), Adam(syn0, syn1)]
datalog = []

# Train model
x = np.arange(1, max_epochs)
for t in x:
    # Get batch
    batch = next(batch_gen)
    for trainer in trainers:
        syn0, syn1 = trainer.get_weight()

        # feed forward
        l0 = batch[0]
        l1 = Math.sigmoid(np.dot(l0, syn0))
        l2 = Math.relu(np.dot(l1, syn1))
def main(local_rank, world_size, init_method='tcp://127.0.0.1:23499'):
    dist.init_process_group(backend='nccl', init_method=init_method,
                            rank=local_rank, world_size=world_size)
    cfg.local_rank = local_rank
    torch.cuda.set_device(local_rank)
    cfg.rank = dist.get_rank()
    cfg.world_size = world_size
    print(cfg.rank, dist.get_world_size())

    trainset = MXFaceDataset(root_dir='/root/face_datasets/webface/', local_rank=local_rank)
    train_sampler = torch.utils.data.distributed.DistributedSampler(trainset, shuffle=True)
    trainloader = DataLoaderX(local_rank=local_rank, dataset=trainset,
                              batch_size=cfg.batch_size, sampler=train_sampler,
                              num_workers=0, pin_memory=True, drop_last=False)

    backbone = iresnet50(False).to(cfg.local_rank)
    backbone.train()
    # backbone = nn.SyncBatchNorm.convert_sync_batchnorm(backbone)

    for ps in backbone.parameters():
        dist.broadcast(ps, 0)

    backbone = torch.nn.parallel.DistributedDataParallel(
        backbone, broadcast_buffers=False, device_ids=[dist.get_rank()])
    backbone.train()

    sub_start, sub_classnum = get_sub_class(cfg.rank, dist.get_world_size())
    print(sub_start, sub_classnum)
    classifier_head = classifier(cfg.embedding_size, sub_classnum, sample_rate=0.4)
    cosface = CosFace(s=64.0, m=0.4)

    optimizer = SGD([{
        'params': backbone.parameters()
    }, {
        'params': classifier_head.parameters()
    }], 0.1, momentum=0.9, weight_decay=cfg.weight_decay, rescale=cfg.world_size)

    warm_up_with_multistep_lr = lambda epoch: (
        (epoch + 1) / (4 + 1))**2 if epoch < -1 else 0.1**len(
            [m for m in [20, 29] if m - 1 <= epoch])
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=warm_up_with_multistep_lr)

    n_epochs = 33
    start_epoch = 0

    if cfg.local_rank == 0:
        writer = SummaryWriter(log_dir='logs/shows')

    global_step = 0
    loss_fun = nn.CrossEntropyLoss()
    for epoch in range(start_epoch, n_epochs):
        train_sampler.set_epoch(epoch)
        for step, (img, label) in enumerate(trainloader):
            start = time.time()
            label_gather, norm_weight = classifier_head.prepare(label, optimizer)
            x = F.normalize(backbone(img))

            # Gather features from all ranks
            x_gather = torch.zeros(x.size()[0] * cfg.world_size,
                                   cfg.embedding_size, device=cfg.local_rank)
            dist.all_gather(list(x_gather.chunk(cfg.world_size, dim=0)), x.data)
            x_gather.requires_grad = True

            logits = classifier_head(x_gather, norm_weight)
            logits = cosface(logits, label_gather)

            with torch.no_grad():
                max_v = torch.max(logits, dim=1, keepdim=True)[0]
                dist.all_reduce(max_v, dist.ReduceOp.MAX)
                exp = torch.exp(logits - max_v)
                sum_exp = exp.sum(dim=1, keepdims=True)
                dist.all_reduce(sum_exp, dist.ReduceOp.SUM)
                exp.div_(sum_exp.clamp_min(1e-20))
                grad = exp
                index = torch.where(label_gather != -1)[0]
                one_hot = torch.zeros(index.size()[0], grad.size()[1], device=grad.device)
                one_hot.scatter_(1, label_gather[index, None], 1)
                loss = torch.zeros(grad.size()[0], 1, device=grad.device)
                loss[index] = grad[index].gather(1, label_gather[index, None])
                dist.all_reduce(loss, dist.ReduceOp.SUM)
                loss_v = loss.clamp_min_(1e-20).log_().mean() * (-1)
                grad[index] -= one_hot
                grad.div_(grad.size()[0])

            logits.backward(grad)
            if x_gather.grad is not None:
                x_gather.grad.detach_()
            x_grad = torch.zeros_like(x)
            dist.reduce_scatter(x_grad, list(x_gather.grad.chunk(cfg.world_size, dim=0)))
            x.backward(x_grad)
            optimizer.step()
            classifier_head.update()
            optimizer.zero_grad()

            if cfg.rank == 0:
                print(x_gather.grad.max(), x_gather.grad.min())
                print('loss_v', loss_v.item(), global_step)
                writer.add_scalar('loss', loss_v, global_step)
                print('lr', optimizer.state_dict()['param_groups'][0]['lr'], global_step)
                print(cfg.batch_size / (time.time() - start))

            global_step += 1
        scheduler.step()
        if cfg.rank == 0:
            torch.save(backbone.module.state_dict(), "models/" + str(epoch) + 'backbone.pth')
    dist.destroy_process_group()
args.layers, NUM_DIRECTIONS, args.dropout, device).to(device)
model = torch.nn.DataParallel(model)
writer = SummaryWriter(comment="_nlp_%s_%s_%s" %
                       (args.optimizer, args.batch_size, args.learning_rate))
weight_decay = args.learning_rate / args.epochs

if args.optimizer == 'lamb':
    optimizer = Lamb(model.parameters(), lr=args.learning_rate, weight_decay=weight_decay,
                     betas=(.9, .999), adam=False, writer=writer)
elif args.optimizer == 'lars':
    base_optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate,
                                     momentum=0.9, weight_decay=weight_decay)
    optimizer = LARS(optimizer=base_optimizer, eps=1e-8, trust_coef=0.001, writer=writer)
elif args.optimizer == 'sgd':
    optimizer = SGD(model.parameters(), lr=args.learning_rate, momentum=0.9,
                    weight_decay=weight_decay, writer=writer)
else:  # use adam optimizer
    optimizer = Lamb(model.parameters(), lr=args.learning_rate, weight_decay=weight_decay,
                     betas=(.9, .999), adam=True, writer=writer)

print(f'The model has {count_parameters(model):,} trainable parameters')

ckpt_dir_name = "%s_%s_%s" % (args.working_dir, args.optimizer, args.batch_size)
model, optimizer = load_pretrained_model(model, optimizer,
                                         "%s/ckpt/%s" % (ckpt_dir_name, "best_weights.pt"))
print(args)

ckpt_dir = os.path.join(ckpt_dir_name, 'ckpt')
os.makedirs(ckpt_dir, exist_ok=True)
try:
    metrics = pickle.load(open(os.path.join(ckpt_dir, 'metrics.p'), 'rb'))
def __init__(self, classes, eta=0.01, iters=10, random_state=1):
    self.classifiers = {c: SGD(eta, iters, random_state) for c in classes}
    self.classes = classes
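# Hedged sketch of a fit()/predict() pair the one-vs-rest wrapper above could use:
# each per-class SGD classifier is trained on a +1/-1 relabelling of y, and
# prediction picks the class with the largest raw score. The method names fit(),
# predict() and net_input(), and the import of SGD, are assumptions rather than
# the repository's confirmed API.
import numpy as np
from sgd import SGD

class OVRSketch:
    def __init__(self, classes, eta=0.01, iters=10, random_state=1):
        self.classifiers = {c: SGD(eta, iters, random_state) for c in classes}
        self.classes = list(classes)

    def fit(self, X, y):
        for c, clf in self.classifiers.items():
            clf.fit(X, np.where(y == c, 1, -1))   # one binary problem per class
        return self

    def predict(self, X):
        scores = np.column_stack([self.classifiers[c].net_input(X) for c in self.classes])
        return np.asarray(self.classes)[np.argmax(scores, axis=1)]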
def main(local_rank):
    dist.init_process_group(backend='nccl', init_method='env://')
    cfg.local_rank = local_rank
    torch.cuda.set_device(local_rank)
    cfg.rank = dist.get_rank()
    cfg.world_size = dist.get_world_size()

    trainset = MXFaceDataset(root_dir=cfg.rec, local_rank=local_rank)
    train_sampler = torch.utils.data.distributed.DistributedSampler(trainset, shuffle=True)
    train_loader = DataLoaderX(local_rank=local_rank, dataset=trainset,
                               batch_size=cfg.batch_size, sampler=train_sampler,
                               num_workers=0, pin_memory=True, drop_last=False)

    backbone = backbones.iresnet100(False).to(local_rank)
    backbone.train()

    # Broadcast init parameters
    for ps in backbone.parameters():
        dist.broadcast(ps, 0)

    # DDP
    backbone = torch.nn.parallel.DistributedDataParallel(
        module=backbone, broadcast_buffers=False, device_ids=[cfg.local_rank])
    backbone.train()

    # Memory classifier
    dist_sample_classifer = DistSampleClassifier(rank=dist.get_rank(),
                                                 local_rank=local_rank,
                                                 world_size=cfg.world_size)

    # Margin softmax
    margin_softmax = MarginSoftmax(s=64.0, m=0.4)

    # Optimizer for backbone and classifier
    optimizer = SGD([{
        'params': backbone.parameters()
    }, {
        'params': dist_sample_classifer.parameters()
    }], lr=cfg.lr, momentum=0.9, weight_decay=cfg.weight_decay, rescale=cfg.world_size)

    # Lr scheduler
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer=optimizer, lr_lambda=cfg.lr_func)
    n_epochs = cfg.num_epoch
    start_epoch = 0

    if local_rank == 0:
        writer = SummaryWriter(log_dir='logs/shows')

    total_step = int(len(trainset) / cfg.batch_size / dist.get_world_size() * cfg.num_epoch)
    if dist.get_rank() == 0:
        print("Total Step is: %d" % total_step)

    losses = AverageMeter()
    global_step = 0
    train_start = time.time()
    for epoch in range(start_epoch, n_epochs):
        train_sampler.set_epoch(epoch)
        for step, (img, label) in enumerate(train_loader):
            total_label, norm_weight = dist_sample_classifer.prepare(label, optimizer)
            features = F.normalize(backbone(img))

            # Features all-gather
            total_features = torch.zeros(features.size()[0] * cfg.world_size,
                                         cfg.embedding_size, device=local_rank)
            dist.all_gather(list(total_features.chunk(cfg.world_size, dim=0)), features.data)
            total_features.requires_grad = True

            # Calculate logits
            logits = dist_sample_classifer(total_features, norm_weight)
            logits = margin_softmax(logits, total_label)

            with torch.no_grad():
                max_fc = torch.max(logits, dim=1, keepdim=True)[0]
                dist.all_reduce(max_fc, dist.ReduceOp.MAX)

                # Calculate exp(logits) and all-reduce
                logits_exp = torch.exp(logits - max_fc)
                logits_sum_exp = logits_exp.sum(dim=1, keepdims=True)
                dist.all_reduce(logits_sum_exp, dist.ReduceOp.SUM)

                # Calculate prob
                logits_exp.div_(logits_sum_exp)

                # Get one-hot
                grad = logits_exp
                index = torch.where(total_label != -1)[0]
                one_hot = torch.zeros(index.size()[0], grad.size()[1], device=grad.device)
                one_hot.scatter_(1, total_label[index, None], 1)

                # Calculate loss
                loss = torch.zeros(grad.size()[0], 1, device=grad.device)
                loss[index] = grad[index].gather(1, total_label[index, None])
                dist.all_reduce(loss, dist.ReduceOp.SUM)
                loss_v = loss.clamp_min_(1e-30).log_().mean() * (-1)

                # Calculate grad
                grad[index] -= one_hot
                grad.div_(features.size()[0])

            logits.backward(grad)
            if total_features.grad is not None:
                total_features.grad.detach_()
            x_grad = torch.zeros_like(features)

            # Feature gradient all-reduce
            dist.reduce_scatter(x_grad, list(total_features.grad.chunk(cfg.world_size, dim=0)))
            x_grad.mul_(cfg.world_size)

            # Backward backbone
            features.backward(x_grad)
            optimizer.step()

            # Update classifier
            dist_sample_classifer.update()
            optimizer.zero_grad()

            losses.update(loss_v, 1)
            if cfg.local_rank == 0 and step % 50 == 0:
                time_now = (time.time() - train_start) / 3600
                time_total = time_now / ((global_step + 1) / total_step)
                time_for_end = time_total - time_now
                writer.add_scalar('time_for_end', time_for_end, global_step)
                writer.add_scalar('loss', loss_v, global_step)
                print(
                    "Speed %d samples/sec Loss %.4f Epoch: %d Global Step: %d Required: %1.f hours"
                    % ((cfg.batch_size * global_step / (time.time() - train_start) * cfg.world_size),
                       losses.avg, epoch, global_step, time_for_end))
                losses.reset()

            global_step += 1
        scheduler.step()
        if dist.get_rank() == 0:
            import os
            if not os.path.exists(cfg.output):
                os.makedirs(cfg.output)
            torch.save(backbone.module.state_dict(),
                       os.path.join(cfg.output, str(epoch) + 'backbone.pth'))
    dist.destroy_process_group()
#!/bin/python3.5
# -*- coding: utf-8 -*-
"""
run.py produces our final submission as "submission.csv" in the data folder.
"""
from als import ALS
from sgd import SGD
from helpers import create_csv_submission, load_data

if __name__ == '__main__':
    # Initializing dataset
    print("Loading dataset")
    path_dataset = "data/data_train.csv"
    ratings = load_data(path_dataset)

    # Creating the sub_file with the best prediction
    # prediction, test_rmse = ALS(ratings, None, 3, 0.2, 0.9)
    prediction, test_rmse = SGD(ratings, None, 0.04, 9, 0.1, 0.016)
    create_csv_submission(prediction)
    print("Submission created at data/submission.csv")
import time

from sgd import SGD
from zfin import ZFin
from worm import WormBase
from fly import FlyBase
from mouse import MGI
from rat import RGD
from mod import MOD

sgd = SGD()
zfin = ZFin()
worm = WormBase()
fly = FlyBase()
mouse = MGI()
rat = RGD()
mod = MOD()

mods = [mouse, zfin, sgd, worm, fly, rat]

for m in mods:
    start_time = time.time()
    m.load_genes()
    print(" --- %s seconds --- " % (time.time() - start_time))

mod.load_homologs()

for m in mods:
    start_time = time.time()
def online_learning_example(it):
    # Initialize weights to random values
    weight_gt = [2, -3]
    weight_final = np.random.randint(-10, 10, size=2)
    # weight_final = np.array([0, 2])

    ground_truth_scm = StructuralCausalModel({
        "init_x": lambda n_samples: np.random.normal(loc=8., scale=2.0, size=n_samples),
        "init_y": lambda n_samples: np.random.normal(loc=8., scale=2.0, size=n_samples),
        "push_x": lambda n_samples: np.random.normal(loc=0., scale=1.0, size=n_samples),
        "push_y": lambda n_samples: np.random.normal(loc=0., scale=1.0, size=n_samples),
        "final_x": linear_model(["init_x", "push_x"], weight_gt, noise_scale=.1),
        "final_y": linear_model(["init_y", "push_y"], weight_gt, noise_scale=.1),
    })
    df_gt = ground_truth_scm.sample(n_samples=it)

    sgd = SGD(.001, 1, init_weights=weight_final)

    for i in range(it):
        gt_init_x = [df_gt.init_x[i]]
        gt_init_y = [df_gt.init_y[i]]
        gt_push_x = [df_gt.push_x[i]]
        gt_push_y = [df_gt.push_y[i]]
        gt_final_x = [df_gt.final_x[i]]
        gt_final_y = [df_gt.final_y[i]]

        intervention_vars = {
            "init_x": gt_init_x,
            "init_y": gt_init_y,
            "push_x": gt_push_x,
            "push_y": gt_push_y
        }

        pred_scm = StructuralCausalModel({
            "init_x": lambda n_samples: np.random.normal(loc=8., scale=2.0, size=n_samples),
            "init_y": lambda n_samples: np.random.normal(loc=8., scale=2.0, size=n_samples),
            "push_x": lambda n_samples: np.random.normal(loc=0., scale=3.0, size=n_samples),
            "push_y": lambda n_samples: np.random.normal(loc=0., scale=3.0, size=n_samples),
            "final_x": linear_model(["init_x", "push_x"], weight_final, noise_scale=.1),
            "final_y": linear_model(["init_y", "push_y"], weight_final, noise_scale=.1),
        })
        pred_scm_do = do_multiple(list(intervention_vars), pred_scm)
        df_pred = pred_scm_do.sample(n_samples=1, set_values=intervention_vars)
        pred_final_x = df_pred.final_x
        pred_final_y = df_pred.final_y

        plt.plot(df_gt.init_x[i], df_gt.init_y[i], 'bo')
        text = "True_weights: {}\n Predicted weights {}".format(weight_gt, weight_final)
        plt.text(df_gt.init_x[i] * (1 + 0.01), df_gt.init_y[i] * (1 + 0.01), text, fontsize=12)
        # plt.plot(df_gt.final_x, df_gt.final_y, 'go')
        plt.quiver(gt_init_x, gt_init_y, gt_final_x, gt_final_y, color="b")
        plt.quiver(gt_init_x, gt_init_y, pred_final_x, pred_final_y, color="r")

        weight_final, rmse_x = sgd.fit(gt_final_x, pred_final_x, [gt_init_x, gt_push_x])
        # weight_final_y, rmse_y = sgd.fit(gt_final_y, gt_final_y)

        plt.pause(1.)
        plt.clf()

    plt.show()
def main(local_rank):
    cfg.local_rank = local_rank
    # cfg.rank = dist.get_rank()
    # cfg.world_size = dist.get_world_size()

    backbone = backbones.iresnet50(False)
    weights = torch.load("pytorch/partial_fc_glint360k_r50/16backbone.pth",
                         map_location=torch.device('cpu'))
    backbone.load_state_dict(weights)
    backbone = backbone.float()
    backbone = backbone.eval()

    # embedding 512
    img1 = cv2.imread('boy_1.jpg')
    img1 = cv2.cvtColor(img1, cv2.COLOR_BGR2RGB)
    img1 = image_preprocessing(img1)
    img1 = np.ones([112, 112, 3], dtype=np.float32)  # overwrites the preprocessed image with a constant tensor (debug leftover?)
    img1 = img1.transpose([2, 0, 1])
    img1 = np.expand_dims(img1, axis=0)
    img1 = torch.from_numpy(img1).float()
    img1 = torch.autograd.Variable(img1, requires_grad=False).to('cpu')

    img2 = cv2.imread('man_2.jpg')
    img2 = cv2.cvtColor(img2, cv2.COLOR_BGR2RGB)
    img2 = image_preprocessing(img2)
    img2 = img2.transpose([2, 0, 1])
    img2 = np.expand_dims(img2, axis=0)
    img2 = torch.from_numpy(img2).float()
    img2 = torch.autograd.Variable(img2, requires_grad=False).to('cpu')

    with torch.no_grad():
        v1 = backbone.forward(img1)
        v2 = backbone.forward(img2)

    v1 = np.asarray(v1)
    import pickle
    pickle.dump(v1, open("sample.pkl", "wb"))
    print(v1)
    result = cosine_dist(v1, v2)
    print(result)
    exit(0)

    # NOTE: the code below is unreachable because of exit(0) above.

    # Broadcast init parameters
    for ps in backbone.parameters():
        dist.broadcast(ps, 0)

    # DDP
    backbone = torch.nn.parallel.DistributedDataParallel(
        module=backbone, broadcast_buffers=False, device_ids=[cfg.local_rank])
    backbone.train()

    # Memory classifier
    dist_sample_classifer = DistSampleClassifier(rank=dist.get_rank(),
                                                 local_rank=local_rank,
                                                 world_size=cfg.world_size)

    # Margin softmax
    margin_softmax = MarginSoftmax(s=64.0, m=0.4)

    # Optimizer for backbone and classifier
    optimizer = SGD([{
        'params': backbone.parameters()
    }, {
        'params': dist_sample_classifer.parameters()
    }], lr=cfg.lr, momentum=0.9, weight_decay=cfg.weight_decay, rescale=cfg.world_size)

    # Lr scheduler
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer=optimizer, lr_lambda=cfg.lr_func)
    n_epochs = cfg.num_epoch
    start_epoch = 0

    if local_rank == 0:
        writer = SummaryWriter(log_dir='logs/shows')

    total_step = int(len(trainset) / cfg.batch_size / dist.get_world_size() * cfg.num_epoch)
    if dist.get_rank() == 0:
        print("Total Step is: %d" % total_step)

    losses = AverageMeter()
    global_step = 0
    train_start = time.time()
    for epoch in range(start_epoch, n_epochs):
        train_sampler.set_epoch(epoch)
        for step, (img, label) in enumerate(train_loader):
            total_label, norm_weight = dist_sample_classifer.prepare(label, optimizer)
            features = F.normalize(backbone(img))

            # Features all-gather
            total_features = torch.zeros(features.size()[0] * cfg.world_size,
                                         cfg.embedding_size, device=local_rank)
            dist.all_gather(list(total_features.chunk(cfg.world_size, dim=0)), features.data)
            total_features.requires_grad = True

            # Calculate logits
            logits = dist_sample_classifer(total_features, norm_weight)
            logits = margin_softmax(logits, total_label)

            with torch.no_grad():
                max_fc = torch.max(logits, dim=1, keepdim=True)[0]
                dist.all_reduce(max_fc, dist.ReduceOp.MAX)

                # Calculate exp(logits) and all-reduce
                logits_exp = torch.exp(logits - max_fc)
                logits_sum_exp = logits_exp.sum(dim=1, keepdims=True)
                dist.all_reduce(logits_sum_exp, dist.ReduceOp.SUM)

                # Calculate prob
                logits_exp.div_(logits_sum_exp)

                # Get one-hot
                grad = logits_exp
                index = torch.where(total_label != -1)[0]
                one_hot = torch.zeros(index.size()[0], grad.size()[1], device=grad.device)
                one_hot.scatter_(1, total_label[index, None], 1)

                # Calculate loss
                loss = torch.zeros(grad.size()[0], 1, device=grad.device)
                loss[index] = grad[index].gather(1, total_label[index, None])
                dist.all_reduce(loss, dist.ReduceOp.SUM)
                loss_v = loss.clamp_min_(1e-30).log_().mean() * (-1)

                # Calculate grad
                grad[index] -= one_hot
                grad.div_(features.size()[0])

            logits.backward(grad)
            if total_features.grad is not None:
                total_features.grad.detach_()
            x_grad = torch.zeros_like(features)

            # Feature gradient all-reduce
            dist.reduce_scatter(x_grad, list(total_features.grad.chunk(cfg.world_size, dim=0)))
            x_grad.mul_(cfg.world_size)

            # Backward backbone
            features.backward(x_grad)
            optimizer.step()

            # Update classifier
            dist_sample_classifer.update()
            optimizer.zero_grad()

            losses.update(loss_v, 1)
            if cfg.local_rank == 0 and step % 50 == 0:
                time_now = (time.time() - train_start) / 3600
                time_total = time_now / ((global_step + 1) / total_step)
                time_for_end = time_total - time_now
                writer.add_scalar('time_for_end', time_for_end, global_step)
                writer.add_scalar('loss', loss_v, global_step)
                print(
                    "Speed %d samples/sec Loss %.4f Epoch: %d Global Step: %d Required: %1.f hours"
                    % ((cfg.batch_size * global_step / (time.time() - train_start) * cfg.world_size),
                       losses.avg, epoch, global_step, time_for_end))
                losses.reset()

            global_step += 1
        scheduler.step()
        if dist.get_rank() == 0:
            import os
            if not os.path.exists(cfg.output):
                os.makedirs(cfg.output)
            torch.save(backbone.module.state_dict(),
                       os.path.join(cfg.output, str(epoch) + 'backbone.pth'))
    dist.destroy_process_group()
hidden_size = 100
time_size = 35
learning_rate = 20.0
max_epoch = 4
max_grad = 0.25

# Load training data
corpus, word_to_id, id_to_word = load_data('train')
corpus_test, *_ = load_data('test')
vocab_size = len(word_to_id)
xs = corpus[:-1]
ts = corpus[1:]

# Generate a model, optimiser and trainer
model = RNNLM(vocab_size, wordvec_size, hidden_size)
optimiser = SGD(learning_rate)
trainer = RNNLMTrainer(model, optimiser)

# 1. Train, applying gradient clipping
training_process = trainer.fit(xs, ts, max_epoch, batch_size, time_size,
                               max_grad, eval_interval=20)
for iter in training_process:
    print(iter)

file_path = '../img/train_rnnlm.png'
trainer.save_plot_image(file_path, ylim=(0, 500))
def main():
    print('=========================================')
    print('                Numpy DNN                ')
    print('               26/Nov/2017               ')
    print('      By Thang Vu ([email protected])     ')
    print('=========================================')

    # Load datasets
    path = 'data/mnist.pkl.gz'
    train_set, val_set, test_set = load_mnist_datasets(path)
    batch_size = 128
    X_train, y_train = train_set
    X_val, y_val = val_set
    X_test, y_test = test_set

    # Bookkeeping for the best model based on the validation set
    best_val_acc = -1
    best_model = None

    # Create model and optimization method
    dnn = DNN()
    sgd = SGD(lr=0.1, lr_decay=0.1, weight_decay=1e-3, momentum=0.9)

    # Train
    batch_size = 128
    for epoch in range(20):
        dnn.train_mode()  # set model to train mode (because of dropout)

        num_train = X_train.shape[0]
        num_batch = num_train // batch_size
        for batch in range(num_batch):
            # Get batch data
            batch_mask = np.random.choice(num_train, batch_size)
            X_batch = X_train[batch_mask]
            y_batch = y_train[batch_mask]

            # Forward
            output = dnn.forward(X_batch)
            loss, dout = softmax_cross_entropy_loss(output, y_batch)
            if batch % 100 == 0:
                print("Epoch %2d Iter %3d Loss %.5f" % (epoch, batch, loss))

            # Backward and update
            grads = dnn.backward(dout)
            sgd.step(dnn.params, grads)

        sgd.decay_learning_rate()  # decay learning rate after one epoch

        dnn.eval_mode()  # set model to eval mode
        train_acc = check_acc(dnn, X_train, y_train)
        val_acc = check_acc(dnn, X_val, y_val)
        if best_val_acc < val_acc:
            best_val_acc = val_acc
            best_model = dnn  # keep the best model based on validation accuracy

        print('Epoch finished.')
        print('Train acc %.3f' % train_acc)
        print('Val acc %.3f' % val_acc)
        print('-' * 30)
        print('')

    print('Training finished. Best val acc %.3f' % best_val_acc)
    test_acc = check_acc(best_model, X_test, y_test)
    print('Test acc %.3f' % test_acc)
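# Hedged sketch of the optimiser used above, reconstructed from the calls
# sgd.step(params, grads) and sgd.decay_learning_rate(). The momentum and
# weight-decay formulation, and the exact decay rule, are assumptions rather
# than the repository's actual implementation.
class MomentumSGDSketch:
    def __init__(self, lr=0.1, lr_decay=0.1, weight_decay=1e-3, momentum=0.9):
        self.lr = lr
        self.lr_decay = lr_decay
        self.weight_decay = weight_decay
        self.momentum = momentum
        self.velocity = {}

    def step(self, params, grads):
        # params and grads are assumed to be dicts of numpy arrays with matching keys
        for key in params:
            g = grads[key] + self.weight_decay * params[key]   # L2 weight decay
            v = self.momentum * self.velocity.get(key, 0.0) - self.lr * g
            self.velocity[key] = v
            params[key] += v

    def decay_learning_rate(self):
        # multiplicative decay once per epoch (exact schedule is an assumption)
        self.lr *= self.lr_decay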
plot_model(model, to_file='model_imdb.png', show_shapes=True)

results_acc = []
result_acc = []
results_loss = []
result_loss = []
test_acc_results = []
test_loss_results = []

l = [
    Adam(lr=0.001, amsgrad=True), AAdam(lr=0.001, amsgrad=True),
    Adam(lr=0.001, amsgrad=False), AAdam(lr=0.001, amsgrad=False),
    Adagrad(), AAdagrad(),
    SGD(), ASGD()
]  # , Adam(lr=0.001, amsgrad=True), AAdam(lr=0.001, amsgrad=True)]

for opt in l:
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
    #model.save_weights('initial_weights_imdb.h5')
    model.load_weights('initial_weights_imdb.h5')
    initial_weights = model.get_weights()
    result_acc = []
    result_loss = []
    test_loss = []
    test_acc = []
def setUp(self):
    model = TwoLayerNet(input_size=2, hidden_size=10, output_size=3)
    optimizer = SGD(lr=1.0)
    self.trainer = Trainer(model, optimizer)
    self.x, self.t = load_data()
    self.data_size = len(self.x)