def parse_args():
    """Parse command-line options for umt.py and derive dependent settings.

    Builds the parser from the shared ``opts`` module, expands the
    ``-word_vec_size`` / ``-layers`` shorthands, seeds the RNGs, selects the
    CUDA device, and removes a stale pycrayon experiment with the same name
    so the caller can create a fresh one.

    Returns:
        argparse.Namespace: the fully resolved options object.
    """
    parser = argparse.ArgumentParser(
        description='umt.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    opts.add_md_help_argument(parser)
    opts.model_opts(parser)
    opts.preprocess_opts(parser)
    opts.train_opts(parser)
    opt = parser.parse_args()

    # -word_vec_size is a shorthand that sets both encoder and decoder sizes.
    if opt.word_vec_size != -1:
        opt.src_word_vec_size = opt.word_vec_size
        opt.tgt_word_vec_size = opt.word_vec_size

    # -layers is a shorthand that sets both encoder and decoder depths.
    if opt.layers != -1:
        opt.enc_layers = opt.layers
        opt.dec_layers = opt.layers

    opt.brnn = (opt.encoder_type == "brnn")

    # Seed the Python and torch RNGs once.
    # BUGFIX: the original called torch.manual_seed(opt.seed) twice in a row
    # (once immediately after parse_args() and again here); the redundant
    # duplicate is removed.  Re-seeding with the same value is idempotent, so
    # the resulting RNG state is unchanged.
    random.seed(opt.seed)
    torch.manual_seed(opt.seed)

    if torch.cuda.is_available() and not opt.gpuid:
        print("WARNING: You have a CUDA device, should run with -gpuid 0")

    if opt.gpuid:
        cuda.set_device(opt.gpuid[0])
        if opt.seed > 0:
            torch.cuda.manual_seed(opt.seed)

    if len(opt.gpuid) > 1:
        sys.stderr.write("Sorry, multigpu isn't supported yet, coming soon!\n")
        sys.exit(1)

    # Set up the Crayon logging server: drop any stale experiment with the
    # same name so the caller can create a fresh one.
    if opt.exp_host != "":
        from pycrayon import CrayonClient
        cc = CrayonClient(hostname=opt.exp_host)
        experiments = cc.get_experiment_names()
        print(experiments)
        if opt.exp in experiments:
            cc.remove_experiment(opt.exp)

    return opt
class Monitor(object):
    """Thin wrapper around a pycrayon ``CrayonClient`` experiment.

    Connects to a Crayon logging server and exposes a minimal API for
    creating/opening a named experiment and pushing scalar metrics to it.
    """

    def __init__(self, address, port):
        # Connect to the Crayon server; the experiment itself is opened
        # lazily via start_experiment().
        self.cc = CrayonClient(hostname=address, port=port)

    def start_experiment(self, name, clean=True):
        """Open experiment ``name``; when ``clean`` and it already exists,
        remove it first so logging starts from scratch."""
        exps = self.cc.get_experiment_names()
        if name in exps:
            if clean:
                self.cc.remove_experiment(name)
                self.exp = self.cc.create_experiment(name)
                # BUGFIX: this was a Python 2 print statement
                # ("print '...'"), which is a SyntaxError under Python 3
                # (every other snippet in this file uses print()).
                # The message text is preserved byte-for-byte.
                print('clean and creat a new one')
            else:
                self.exp = self.cc.open_experiment(name)
        else:
            self.exp = self.cc.create_experiment(name)

    def push(self, data, wall_time=-1, step=-1):
        """Send a dict of scalar values to the current experiment.

        ``wall_time`` / ``step`` default to -1, which pycrayon interprets
        as "use the current time / auto-increment".
        """
        self.exp.add_scalar_dict(data, wall_time, step)
def get_crayon_experiment(exp_name, hostname='127.0.0.1', overwrite=True):
    """Return a pycrayon experiment handle named ``exp_name``.

    Connects to the Crayon server at ``hostname``.  If the experiment
    already exists it is either recreated (``overwrite=True``) or simply
    reopened; otherwise a new one is created, retrying once after removing
    a stale server-side entry when creation raises ``ValueError``.
    """
    client = CrayonClient(hostname=hostname)
    known = client.get_experiment_names()

    if exp_name not in known:
        # Fresh name: create it, recovering once from a stale server entry.
        try:
            return client.create_experiment(exp_name)
        except ValueError:
            client.remove_experiment(exp_name)
            return client.create_experiment(exp_name)

    if overwrite:
        client.remove_experiment(exp_name)
        return client.create_experiment(exp_name)

    return client.open_experiment(exp_name)
if opt.gpuid: cuda.set_device(opt.gpuid[0]) if opt.seed > 0: torch.cuda.manual_seed(opt.seed) if len(opt.gpuid) > 1: sys.stderr.write("Sorry, multigpu isn't supported yet, coming soon!\n") sys.exit(1) # Set up the Crayon logging server. if opt.exp_host != "": from pycrayon import CrayonClient cc = CrayonClient(hostname=opt.exp_host) experiments = cc.get_experiment_names() print(experiments) if opt.exp in experiments: cc.remove_experiment(opt.exp) experiment = cc.create_experiment(opt.exp) if opt.tensorboard: from tensorboardX import SummaryWriter writer = SummaryWriter( opt.tensorboard_log_dir + datetime.now().strftime("/%b-%d_%H-%M-%S"), comment="Onmt") progress_step = 0 def report_func(epoch, batch, num_batches,
net.parameters()), lr=lr) if not os.path.exists(output_dir): os.mkdir(output_dir) # tensorboad use_tensorboard = use_tensorboard and CrayonClient is not None if use_tensorboard: cc = CrayonClient(hostname='127.0.0.1') if remove_all_log: cc.remove_all_experiments() if exp_name is None: exp_name = datetime.now().strftime('vgg16_%m-%d_%H-%M') exp_name = save_exp_name if exp_name in cc.get_experiment_names(): cc.remove_experiment(exp_name) exp = cc.create_experiment(exp_name) else: exp = cc.open_experiment(exp_name) # training train_loss = 0 step_cnt = 0 re_cnt = False t = Timer() t.tic() best_mae = sys.maxsize for epoch in range(start_step, end_step + 1):
def main():
    """Entry point for (legacy OpenNMT-style) training.

    Reads the module-level ``opt`` namespace: sets up optional Crayon
    logging, loads the preprocessed dataset, builds encoder/decoder/
    generator, optionally restores from a checkpoint, builds the optimizer,
    and hands everything to ``trainModel`` via a ``MultiprocessingTrainer``.
    """
    # Set up the Crayon logging server.
    if opt.log_server != "":
        from pycrayon import CrayonClient
        cc = CrayonClient(hostname=opt.log_server)
        experiments = cc.get_experiment_names()
        print(experiments)
        # Drop a stale experiment with the same name before recreating it.
        if opt.experiment_name in experiments:
            cc.remove_experiment(opt.experiment_name)
        # NOTE(review): opt.experiment_name is rebound from a string to the
        # experiment handle object here — later code must treat it as such.
        opt.experiment_name = cc.create_experiment(opt.experiment_name)

    print("Loading data from '%s'" % opt.data)
    dataset = torch.load(opt.data)

    # Either resume source may supply the checkpoint used below.
    dict_checkpoint = (opt.train_from if opt.train_from
                       else opt.train_from_state_dict)
    if dict_checkpoint:
        print('Loading dicts from checkpoint at %s' % dict_checkpoint)
        # map_location keeps checkpoint tensors on CPU regardless of where
        # they were saved.
        checkpoint = torch.load(dict_checkpoint,
                                map_location=lambda storage, loc: storage)
        #dataset['dicts'] = checkpoint['dicts']

    # Build train/valid iterators, either backed by Redis or in-memory.
    if opt.redis:
        trainData = onmt.RedisDataset("train", opt.batch_size, False,
                                      reverse=opt.reverse, port=opt.port,
                                      db=opt.db, r2l=opt.r2l)
        validData = onmt.RedisDataset('valid', opt.batch_size, False,
                                      volatile=True, reverse=opt.reverse,
                                      port=opt.port, r2l=opt.r2l, db=opt.db)
    else:
        trainData = onmt.Dataset(dataset['train']['src'],
                                 dataset['train']['tgt'],
                                 opt.batch_size, False,
                                 data_type=dataset.get("type", "text"),
                                 srcFeatures=dataset['train'].get('src_features'),
                                 tgtFeatures=dataset['train'].get('tgt_features'),
                                 alignment=dataset['train'].get('alignments'))
        validData = onmt.Dataset(dataset['valid']['src'],
                                 dataset['valid']['tgt'],
                                 opt.batch_size, False, volatile=True,
                                 data_type=dataset.get("type", "text"),
                                 srcFeatures=dataset['valid'].get('src_features'),
                                 tgtFeatures=dataset['valid'].get('tgt_features'),
                                 alignment=dataset['valid'].get('alignments'))

    dicts = dataset['dicts']
    # -reverse trains the tgt->src direction: swap vocabularies and features.
    if opt.reverse:
        dicts['src'], dicts['tgt'] = dicts['tgt'], dicts['src']
        dicts['src_features'], dicts['tgt_features'] = \
            dicts['tgt_features'], dicts['src_features']

    print(' * vocabulary size. source = %d; target = %d' %
          (dicts['src'].size(), dicts['tgt'].size()))
    #if 'src_features' in dicts:
    #    for j in range(len(dicts['src_features'])):
    #        print(' * src feature %d size = %d' %
    #              (j, dicts['src_features'][j].size()))
    #print(' * number of training sentences. %d' %
    #len(dataset['train']['src']))
    print(' * maximum batch size. %d' % opt.batch_size)

    print('Building model...')

    # Encoder: text RNN or image CNN depending on the configured type.
    if opt.encoder_type == "text":
        encoder = onmt.Models.Encoder(opt, dicts['src'],
                                      dicts.get('src_features', None))
    elif opt.encoder_type == "img":
        encoder = onmt.modules.ImageEncoder(opt)
        assert("type" not in dataset or dataset["type"] == "img")
    else:
        # NOTE(review): unknown encoder types are only printed, not raised —
        # execution falls through with `encoder` unbound.
        print("Unsupported encoder type %s" % (opt.encoder_type))

    decoder = onmt.Models.Decoder(opt, dicts['tgt'])

    # Generator: copy-attention variant or a plain linear+log-softmax head.
    if opt.copy_attn:
        generator = onmt.modules.CopyGenerator(opt, dicts['src'],
                                               dicts['tgt'])
    else:
        generator = nn.Sequential(
            nn.Linear(opt.rnn_size, dicts['tgt'].size()),
            nn.LogSoftmax())
        if opt.share_decoder_embeddings:
            # Tie the output projection to the decoder embedding matrix.
            generator[0].weight = decoder.embeddings.word_lut.weight

    model = onmt.Models.NMTModel(encoder, decoder, len(opt.gpus) > 1)

    # Resume from a full-model checkpoint (strips the generator weights out
    # of the model state dict and loads them separately).
    if opt.train_from:
        print('Loading model from checkpoint at %s' % opt.train_from)
        chk_model = checkpoint['model']
        generator_state_dict = chk_model.generator.state_dict()
        model_state_dict = {k: v for k, v in chk_model.state_dict().items()
                            if 'generator' not in k}
        model.load_state_dict(model_state_dict)
        generator.load_state_dict(generator_state_dict)
        opt.start_epoch = checkpoint['epoch'] + 1

    # Resume from a bare state-dict checkpoint.
    if opt.train_from_state_dict:
        print('Loading model from checkpoint at %s'
              % opt.train_from_state_dict)
        model.load_state_dict(checkpoint['model'])
        generator.load_state_dict(checkpoint['generator'])
        opt.start_epoch = checkpoint['epoch'] + 1

    # Keep master copies on CPU; the trainer handles device placement.
    model.cpu()
    generator.cpu()
    model.generator = generator

    if not opt.train_from_state_dict and not opt.train_from:
        # Fresh run: uniform init, optional pretrained embeddings, new optim.
        if opt.param_init != 0.0:
            print('Intializing params')
            for p in model.parameters():
                p.data.uniform_(-opt.param_init, opt.param_init)

        encoder.embeddings.load_pretrained_vectors(opt.pre_word_vecs_enc)
        decoder.embeddings.load_pretrained_vectors(opt.pre_word_vecs_dec)

        optim = onmt.Optim(
            opt.optim, opt.learning_rate, opt.max_grad_norm,
            lr_decay=opt.learning_rate_decay,
            start_decay_at=opt.start_decay_at,
            opt=opt
        )
    else:
        print('Loading optimizer from checkpoint:')
        optim = checkpoint['optim']
        print(optim)

    # Restore the inner optimizer state (momentum buffers etc.) on resume.
    if opt.train_from or opt.train_from_state_dict:
        optim.optimizer.load_state_dict(
            checkpoint['optim'].optimizer.state_dict())

    print('Multi gpu training ', opt.gpus)
    trainer = MultiprocessingTrainer(opt, model, optim,
                                     device_ids=opt.gpus)

    # Parameter accounting: total, plus an encoder/decoder breakdown.
    nParams = sum([p.nelement() for p in model.parameters()])
    print('* number of parameters: %d' % nParams)
    enc = 0
    dec = 0
    for name, param in model.named_parameters():
        if 'encoder' in name:
            enc += param.nelement()
        elif 'decoder' in name:
            dec += param.nelement()
        else:
            print(name, param.nelement())
    print('encoder: ', enc)
    print('decoder: ', dec)

    trainModel(trainer, trainData, validData, dataset)
def main(args):
    """Train an EncoderCNN/DecoderRNN image-captioning model.

    Optionally logs scalars to a local Crayon server, saves run parameters
    and periodic encoder/decoder checkpoints under
    ``args.model_path/args.name``.
    """
    # Set up tensorboard-style logging via a local Crayon server.
    if args.tensorboard:
        cc = CrayonClient(hostname="localhost")
        print(cc.get_experiment_names())
        #if args.name in cc.get_experiment_names():
        # NOTE(review): bare except silently swallows all errors here, not
        # just "experiment missing".
        try:
            cc.remove_experiment(args.name)
        except:
            print("experiment didnt exist")
        cc_server = cc.create_experiment(args.name)

    # Create model directory and persist the run configuration.
    full_model_path = args.model_path + "/" + args.name
    if not os.path.exists(full_model_path):
        os.makedirs(full_model_path)
    with open(full_model_path + "/parameters.json", 'w') as f:
        f.write((json.dumps(vars(args))))

    # Image preprocessing (inputs normalized to [-1, 1] per channel).
    transform = transforms.Compose([
        transforms.Scale(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    mini_transform = transforms.Compose(
        [transforms.ToPILImage(), transforms.Scale(20),
         transforms.ToTensor()])

    # Load vocabulary wrapper, or build and cache one alongside the model.
    if args.vocab_path is not None:
        with open(args.vocab_path, 'rb') as f:
            vocab = pickle.load(f)
    else:
        print("building new vocab")
        vocab = build_vocab(args.image_dir, 1, None)
        with open((full_model_path + "/vocab.pkl"), 'wb') as f:
            pickle.dump(vocab, f)

    # Build data loaders: the shuffled training loader plus a train/val
    # split used by the (currently disabled) evaluation block below.
    data_loader = get_loader(args.image_dir, vocab, transform,
                             args.batch_size, shuffle=True,
                             num_workers=args.num_workers)
    code_data_set = ProcessingDataset(root=args.image_dir, vocab=vocab,
                                      transform=transform)
    train_ds, val_ds = validation_split(code_data_set)
    train_loader = torch.utils.data.DataLoader(train_ds,
                                               collate_fn=collate_fn)
    test_loader = torch.utils.data.DataLoader(val_ds,
                                              collate_fn=collate_fn)
    train_size = len(train_loader)
    test_size = len(test_loader)

    # Build the models.
    encoder = EncoderCNN(args.embed_size, args.train_cnn)
    print(encoder)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    print(decoder)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and optimizer: train the decoder plus only the encoder's final
    # linear layer and batch-norm (the CNN backbone stays frozen here).
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    #params = list(decoder.parameters()) #+ list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    start_time = time.time()
    add_log_entry(args.name, start_time, vars(args))

    # Train the models.
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            decoder.train()
            encoder.train()
            # Set mini-batch dataset; targets are the packed caption tokens.
            image_ts = to_var(images, volatile=True)
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]
            count = images.size()[0]

            # Forward, backward and optimize.
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(image_ts)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Token-level accuracy over the packed targets.
            total = targets.size(0)
            max_index = outputs.max(dim=1)[1]
            #correct = (max_index == targets).sum()
            _, predicted = torch.max(outputs.data, 1)
            correct = predicted.eq(targets.data).cpu().sum()
            accuracy = 100. * correct / total

            if args.tensorboard:
                cc_server.add_scalar_value("train_loss", loss.data[0])
                cc_server.add_scalar_value("perplexity",
                                           np.exp(loss.data[0]))
                cc_server.add_scalar_value("accuracy", accuracy)

            # Print log info.
            if i % args.log_step == 0:
                print(
                    'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, accuracy: %2.2f Perplexity: %5.4f'
                    % (epoch, args.num_epochs, i, total_step, loss.data[0],
                       accuracy, np.exp(loss.data[0])))

            # Save the models every save_step batches.
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(full_model_path,
                                 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(full_model_path,
                                 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))

            # NOTE(review): `1 == 2` makes this whole validation block dead
            # code — it is kept verbatim but never runs.
            if 1 == 2 and i % int(train_size / 10) == 0:
                encoder.eval()
                #decoder.eval()
                correct = 0
                for ti, (timages, tcaptions,
                         tlengths) in enumerate(test_loader):
                    timage_ts = to_var(timages, volatile=True)
                    tcaptions = to_var(tcaptions)
                    ttargets = pack_padded_sequence(tcaptions, tlengths,
                                                    batch_first=True)[0]
                    tfeatures = encoder(timage_ts)
                    toutputs = decoder(tfeatures, tcaptions, tlengths)
                    print(ttargets)
                    print(toutputs)
                    print(ttargets.size())
                    print(toutputs.size())
                    #correct = (ttargets.eq(toutputs[0].long())).sum()
                accuracy = 100 * correct / test_size
                print('accuracy: %.4f' % (accuracy))
                if args.tensorboard:
                    cc_server.add_scalar_value("accuracy", accuracy)

        # End-of-epoch checkpoint (reuses the last batch index `i`).
        # NOTE(review): original indentation was lost in extraction — these
        # saves are placed at epoch level; confirm against the upstream file.
        torch.save(
            decoder.state_dict(),
            os.path.join(full_model_path,
                         'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
        torch.save(
            encoder.state_dict(),
            os.path.join(full_model_path,
                         'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))

    end_time = time.time()
    print("finished training, runtime: %d", [(end_time - start_time)])
def eval_loop(counter, args, shared_model, model_eval):
    """Evaluation loop for an A3C-style trainer.

    Runs in its own process: repeatedly syncs a local copy of
    ``shared_model``, evaluates it ``args.n_eval`` times in ``env``,
    reports rewards to visdom/Crayon, and checkpoints the model.  On
    Ctrl-C it saves a last snapshot and re-raises.
    """
    try:
        SEC_PER_DAY = 24 * 60 * 60
        env = build_env(args.type, args,
                        treat_life_lost_as_terminal=False,
                        max_time=5 * 60)
        # Evaluate on a private copy so the workers' model keeps training.
        model = copy.deepcopy(shared_model)
        model.eval()

        # Create a new experiment (one Crayon summary per eval slot).
        vis = visdom.Visdom(env='A3C:' + args.name)
        cc = CrayonClient()
        names = cc.get_experiment_names()
        summaries = []
        for idx in range(args.n_eval):
            name = "{} [{}]".format(args.name, idx + 1)
            if name in names:
                cc.remove_experiment(name)
            summaries.append(cc.create_experiment(name))

        max_reward = None
        save_condition = args.save_intervel
        rewards = []
        start_time = time.time()
        while True:
            # Sync with the shared model.
            model.load_state_dict(shared_model.state_dict())
            restart, eval_start_time, eval_start_step = False, time.time(
            ), counter.value

            # Run n_eval independent rollouts; if the env hit its time
            # limit mid-rollout, discard this round and start over.
            results = []
            for i in range(args.n_eval):
                model.reset_state()
                results.append(model_eval(model, env, vis=(vis, i + 1, 60)))
                if env.exceed_max:
                    restart = True
                    env.reset()
                    break
                env.reset()
            if restart:
                continue

            eval_end_time, eval_end_step = time.time(), counter.value
            # Transpose per-rollout results into per-field sequences.
            results = EvalResult(*zip(*results))
            rewards.append((counter.value, results.reward))
            local_max_reward = np.max(results.reward)
            if max_reward is None or max_reward < local_max_reward:
                max_reward = local_max_reward
            # NOTE(review): after the update above this condition also holds
            # on ties, so best_model.pth is rewritten whenever this round
            # matches the running maximum.
            if local_max_reward >= max_reward:
                # Save model.
                torch.save(model.state_dict(),
                           os.path.join(args.model_path, 'best_model.pth'))

            # Progress/ETA report: elapsed time, and a linear extrapolation
            # of remaining steps at this round's steps-per-second rate.
            time_since_start = eval_end_time - start_time
            day = time_since_start // SEC_PER_DAY
            time_since_start %= SEC_PER_DAY
            seconds_to_finish = (args.n_steps - eval_end_step) / (
                eval_end_step - eval_start_step) * (eval_end_time -
                                                    eval_start_time)
            days_to_finish = seconds_to_finish // SEC_PER_DAY
            seconds_to_finish %= SEC_PER_DAY
            print("STEP:[{}|{}], Time: {}d {}, Finish in {}d {}".format(
                counter.value, args.n_steps, '%02d' % day,
                time.strftime("%Hh %Mm %Ss", time.gmtime(time_since_start)),
                '%02d' % days_to_finish,
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(seconds_to_finish))))
            print(
                '\tMax reward: {}, avg_reward: {}, std_reward: {}, min_reward: {}, max_reward: {}'
                .format(max_reward, np.mean(results.reward),
                        np.std(results.reward), np.min(results.reward),
                        local_max_reward))

            # Plot one reward point per eval slot.
            for summary, reward in zip(summaries, results.reward):
                summary.add_scalar_value('reward', reward,
                                         step=eval_start_step)

            # Periodic checkpoint + flush buffered rewards to disk.
            if counter.value > save_condition or counter.value >= args.n_steps:
                save_condition += args.save_intervel
                torch.save(
                    model.state_dict(),
                    os.path.join(args.model_path,
                                 'model_iter_{}.pth'.format(counter.value)))
                torch.save(model.state_dict(),
                           os.path.join(args.model_path,
                                        'model_latest.pth'))
                with open(os.path.join(args.save_path, 'rewards'),
                          'a+') as f:
                    for record in rewards:
                        f.write('{}: {}\n'.format(record[0], record[1]))
                del rewards[:]

            if counter.value >= args.n_steps:
                print('Evaluator Finished !!!')
                break
    except KeyboardInterrupt:
        # Save a final snapshot of the shared model before propagating.
        torch.save(shared_model.state_dict(),
                   os.path.join(args.model_path, 'model_latest.pth'))
        raise
print("validation loss = {0:.10}, validation accuracy = {1:.5}". format(loss_score, acc_score)) print("answerer loss = {0:.10}, discrim. loss = {1:.10}". format(ans_loss_score, dis_loss_score)) loss_score, ans_loss_score, dis_loss_score, acc_score = validate_bireader(net, test_loader, params) print("validation loss = {0:.10}, validation accuracy = {1:.5}". format(loss_score, acc_score)) print("answerer loss = {0:.10}, discrim. loss = {1:.10}". format(ans_loss_score, dis_loss_score)) else: if arg.log: # crayon client cc = CrayonClient(hostname="localhost", port=8889) existing = len(cc.get_experiment_names()) ce = cc.create_experiment("run_{0}".format(existing), zip_file=None) print("now training...") train_1 = pd.read_pickle("../input_data/train_{0}.pkl".format(params['lang'])) train_2 = pd.read_pickle("../input_data/train_{0}.pkl".format(params['lang2'])) train_loader = tud.DataLoader(BiQADataset(train_1, train_2, nlp_1, nlp_2, rev_dic_1, rev_dic_2, relabel=params['relabel'], l2_supersample=params['l2_supersample']), batch_size=params['batch_size'], pin_memory=True, num_workers=3, shuffle=True) dev_1 = pd.read_pickle("../input_data/dev_{0}.pkl".format(params['lang'])) dev_2 = pd.read_pickle("../input_data/dev_{0}.pkl".format(params['lang2'])) dev_loader = tud.DataLoader(BiQADataset(dev_1, dev_2, nlp_1, nlp_2,
def main(args):
    """Entry point for (OpenNMT-style) train.py.

    Parses ``args``, resolves option shorthands, seeds RNGs, configures
    GPU/logging backends, optionally resumes from a checkpoint, then builds
    the model and optimizer and runs ``train_model``.
    """
    parser = argparse.ArgumentParser(
        description='train.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # opts.py
    opts.add_md_help_argument(parser)
    opts.model_opts(parser)
    opts.train_opts(parser)

    opt = parser.parse_args(args)

    # -word_vec_size is a shorthand for both encoder and decoder sizes.
    if opt.word_vec_size != -1:
        opt.src_word_vec_size = opt.word_vec_size
        opt.tgt_word_vec_size = opt.word_vec_size

    # -layers is a shorthand for both encoder and decoder depths.
    if opt.layers != -1:
        opt.enc_layers = opt.layers
        opt.dec_layers = opt.layers

    opt.brnn = (opt.encoder_type == "brnn")

    if opt.seed > 0:
        random.seed(opt.seed)
        torch.manual_seed(opt.seed)

    # SRU kernels are CUDA-only.
    if opt.rnn_type == "SRU" and not opt.gpuid:
        raise AssertionError("Using SRU requires -gpuid set.")

    if torch.cuda.is_available() and not opt.gpuid:
        print("WARNING: You have a CUDA device, should run with -gpuid 0")

    if opt.gpuid:
        cuda.set_device(opt.gpuid[0])
        if opt.seed > 0:
            torch.cuda.manual_seed(opt.seed)

    if len(opt.gpuid) > 1:
        sys.stderr.write("Sorry, multigpu isn't supported yet, coming soon!\n")
        sys.exit(1)

    # Set up the Crayon logging server.
    experiment = None
    if opt.exp_host != "":
        from pycrayon import CrayonClient
        cc = CrayonClient(hostname=opt.exp_host)

        experiments = cc.get_experiment_names()
        print(experiments)
        # Replace any stale experiment with the same name.
        if opt.exp in experiments:
            cc.remove_experiment(opt.exp)
        experiment = cc.create_experiment(opt.exp)

    writer = None
    if opt.tensorboard:
        from tensorboardX import SummaryWriter
        writer = SummaryWriter(opt.tensorboard_log_dir, comment="Onmt")

    # Load checkpoint if we resume from a previous training.
    if opt.train_from:
        print('Loading checkpoint from %s' % opt.train_from)
        # map_location keeps checkpoint tensors on CPU regardless of where
        # they were saved.
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)
        # Rebuild the model with the options it was originally trained with.
        model_opt = checkpoint['opt']
        # I don't like reassigning attributes of opt: it's not clear.
        opt.start_epoch = checkpoint['epoch'] + 1
    else:
        checkpoint = None
        model_opt = opt

    # Peek the first dataset to determine the data_type.
    # (All datasets have the same data_type).
    first_dataset = next(lazily_load_dataset("train", opt))
    data_type = first_dataset.data_type

    # Load fields generated from preprocess phase.
    fields = load_fields(first_dataset, data_type, checkpoint, opt)

    # Report src/tgt features.
    collect_report_features(fields)

    # Build model.
    model = build_model(model_opt, opt, fields, checkpoint)
    tally_parameters(model)
    check_save_model_path(opt)

    # Build optimizer.
    optim = build_optim(model, checkpoint, opt)

    # Do training.
    train_model(model, fields, optim, data_type, model_opt, experiment,
                writer, opt)

    # If using tensorboard for logging, close the writer after training.
    if opt.tensorboard:
        writer.close()