def train_reconstruction(args):
    """Train the text convolution/deconvolution autoencoder and checkpoint it.

    Loads the pretrained word embedding and vocabulary for the target
    dataset, builds the ConvolutionEncoder/DeconvolutionDecoder pair, and
    runs the training loop under a cyclical learning-rate schedule, logging
    progress and ROUGE-based evaluation to a Hyperdash experiment.

    Args:
        args: parsed CLI namespace. Reads gpu, target_dataset, batch_size,
            shuffle, filter_shape, filter_size, latent_size, tau, resume,
            weight_decay, half_cycle_interval, lr, lr_factor, epochs and
            log_interval. Writes ``args.t3`` as a side effect (consumed by
            the model constructors / downstream code).
    """
    device = torch.device(args.gpu)

    print("Loading embedding model...")
    with open(os.path.join(CONFIG.DATASET_PATH, args.target_dataset,
                           'word_embedding.p'), "rb") as f:
        embedding_model = cPickle.load(f)
    with open(os.path.join(CONFIG.DATASET_PATH, args.target_dataset,
                           'word_idx.json'), "r", encoding='utf-8') as f:
        # As used below: word_idx[0] is idx->word, word_idx[1] is word->idx.
        word_idx = json.load(f)
    print("Loading embedding model completed")

    print("Loading dataset...")
    train_dataset, val_dataset = load_text_data(args, CONFIG,
                                                word2idx=word_idx[1])
    print("Loading dataset completed")
    # Validation loader is deliberately unshuffled for reproducible eval.
    train_loader, val_loader = DataLoader(train_dataset,
                                          batch_size=args.batch_size,
                                          shuffle=args.shuffle),\
        DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False)

    # t1 = max_sentence_len + 2 * (args.filter_shape - 1)
    t1 = CONFIG.MAX_SENTENCE_LEN
    # Output lengths after each of the two stride-2 convolutions in the
    # encoder; the divisor "2" is the stride.
    t2 = int(math.floor((t1 - args.filter_shape) / 2) + 1)
    t3 = int(math.floor((t2 - args.filter_shape) / 2) + 1)
    args.t3 = t3

    embedding = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_model))
    text_encoder = text_model.ConvolutionEncoder(embedding, t3,
                                                 args.filter_size,
                                                 args.filter_shape,
                                                 args.latent_size)
    text_decoder = text_model.DeconvolutionDecoder(embedding, args.tau, t3,
                                                   args.filter_size,
                                                   args.filter_shape,
                                                   args.latent_size, device)

    if args.resume:
        print("Restart from checkpoint")
        # map_location keeps the load CPU-side; modules are moved to the
        # target device later via .to(device).
        checkpoint = torch.load(os.path.join(CONFIG.CHECKPOINT_PATH,
                                             args.resume),
                                map_location=lambda storage, loc: storage)
        start_epoch = checkpoint['epoch']
        text_encoder.load_state_dict(checkpoint['text_encoder'])
        text_decoder.load_state_dict(checkpoint['text_decoder'])
    else:
        print("Start from initial")
        start_epoch = 0

    text_autoencoder = text_model.TextAutoencoder(text_encoder, text_decoder)
    criterion = nn.NLLLoss().to(device)
    text_autoencoder.to(device)

    # lr=1. is a placeholder: the LambdaLR multiplier below supplies the
    # actual (cyclical) learning rate each step.
    optimizer = AdamW(text_autoencoder.parameters(), lr=1.,
                      weight_decay=args.weight_decay, amsgrad=True)
    step_size = args.half_cycle_interval * len(train_loader)
    clr = cyclical_lr(step_size, min_lr=args.lr,
                      max_lr=args.lr * args.lr_factor)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, [clr])
    if args.resume:
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])

    exp = Experiment("Text autoencoder " + str(args.latent_size),
                     capture_io=False)
    for arg, value in vars(args).items():
        exp.param(arg, value)

    try:
        text_autoencoder.train()
        for epoch in range(start_epoch, args.epochs):
            print("Epoch: {}".format(epoch))
            for steps, batch in enumerate(train_loader):
                torch.cuda.empty_cache()
                feature = Variable(batch).to(device)
                optimizer.zero_grad()
                prob = text_autoencoder(feature)
                # NLLLoss expects (batch, classes, seq_len); prob comes out
                # as (batch, seq_len, classes), hence the transpose.
                loss = criterion(prob.transpose(1, 2), feature)
                loss.backward()
                optimizer.step()
                scheduler.step()
                if (steps * args.batch_size) % args.log_interval == 0:
                    # Decode the first sample of the batch for a qualitative
                    # input/reconstruction printout.
                    input_data = feature[0]
                    single_data = prob[0]
                    _, predict_index = torch.max(single_data, 1)
                    input_sentence = util.transform_idx2word(
                        input_data.detach().cpu().numpy(),
                        idx2word=word_idx[0])
                    predict_sentence = util.transform_idx2word(
                        predict_index.detach().cpu().numpy(),
                        idx2word=word_idx[0])
                    print("Epoch: {} at {} lr: {}".format(
                        epoch, str(datetime.datetime.now()),
                        str(scheduler.get_lr())))
                    print("Steps: {}".format(steps))
                    print("Loss: {}".format(loss.detach().item()))
                    print("Input Sentence:")
                    print(input_sentence)
                    print("Output Sentence:")
                    print(predict_sentence)
                    del input_data, single_data, _, predict_index
                # Drop references eagerly to keep GPU memory headroom.
                del feature, prob, loss
            exp.log("\nEpoch: {} at {} lr: {}".format(
                epoch, str(datetime.datetime.now()),
                str(scheduler.get_lr())))
            _avg_loss, _rouge_1, _rouge_2 = eval_reconstruction_with_rouge(
                text_autoencoder, word_idx[0], criterion, val_loader, device)
            exp.log("\nEvaluation - loss: {} Rouge1: {} Rouge2: {}".format(
                _avg_loss, _rouge_1, _rouge_2))
            util.save_models(
                {
                    'epoch': epoch + 1,
                    'text_encoder': text_encoder.state_dict(),
                    'text_decoder': text_decoder.state_dict(),
                    'avg_loss': _avg_loss,
                    # BUG FIX: key was 'Rouge1:' (stray colon), inconsistent
                    # with 'Rouge2' and invisible to readers of 'Rouge1'.
                    'Rouge1': _rouge_1,
                    'Rouge2': _rouge_2,
                    'optimizer': optimizer.state_dict(),
                    'scheduler': scheduler.state_dict()
                }, CONFIG.CHECKPOINT_PATH,
                "text_autoencoder_" + str(args.latent_size))
        print("Finish!!!")
    finally:
        # Always close the Hyperdash experiment, even on interrupt/error.
        exp.end()
def train_reconstruction(args):
    """Train the image-sequence convolutional autoencoder and checkpoint it.

    Mirrors the text-autoencoder trainer: builds a ConvolutionEncoder /
    DeconvolutionDecoder pair over sequences of image feature vectors,
    trains with MSE reconstruction loss under a cyclical learning-rate
    schedule, and logs to a Hyperdash experiment.

    Args:
        args: parsed CLI namespace. Reads gpu, batch_size, shuffle,
            embedding_dim, resume, weight_decay, half_cycle_interval, lr,
            lr_factor, latent_size, epochs, log_interval.
    """
    device = torch.device(args.gpu)
    print("Loading dataset...")
    train_dataset, val_dataset = load_imgseq_data(args, CONFIG)
    print("Loading dataset completed")
    # Validation loader is deliberately unshuffled for reproducible eval.
    train_loader, val_loader = DataLoader(train_dataset,
                                          batch_size=args.batch_size,
                                          shuffle=args.shuffle),\
        DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False)

    t1 = CONFIG.MAX_SEQUENCE_LEN
    # Output lengths after two convolutions with filter_shape=3 and
    # stride 1 (the divisor below). NOTE: filter_size/filter_shape/
    # latent_size are hard-coded (300/3/1000) rather than taken from args.
    t2 = int(math.floor((t1 - 3) / 1) + 1)
    t3 = int(math.floor((t2 - 3) / 1) + 1)
    imgseq_encoder = imgseq_model.ConvolutionEncoder(
        embedding_dim=args.embedding_dim, t3=t3, filter_size=300,
        filter_shape=3, latent_size=1000)
    imgseq_decoder = imgseq_model.DeconvolutionDecoder(
        embedding_dim=args.embedding_dim, t3=t3, filter_size=300,
        filter_shape=3, latent_size=1000)

    if args.resume:
        print("Restart from checkpoint")
        # map_location keeps the load CPU-side; moved to device below.
        checkpoint = torch.load(os.path.join(CONFIG.CHECKPOINT_PATH,
                                             args.resume),
                                map_location=lambda storage, loc: storage)
        start_epoch = checkpoint['epoch']
        imgseq_encoder.load_state_dict(checkpoint['imgseq_encoder'])
        imgseq_decoder.load_state_dict(checkpoint['imgseq_decoder'])
    else:
        print("Start from initial")
        start_epoch = 0

    imgseq_autoencoder = imgseq_model.ImgseqAutoEncoder(imgseq_encoder,
                                                        imgseq_decoder)
    criterion = nn.MSELoss().to(device)
    imgseq_autoencoder.to(device)

    # lr=1. is a placeholder: the LambdaLR multiplier supplies the actual
    # (cyclical) learning rate each step.
    optimizer = AdamW(imgseq_autoencoder.parameters(), lr=1.,
                      weight_decay=args.weight_decay, amsgrad=True)
    step_size = args.half_cycle_interval * len(train_loader)
    clr = cyclical_lr(step_size, min_lr=args.lr,
                      max_lr=args.lr * args.lr_factor)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, [clr])
    if args.resume:
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])

    exp = Experiment("Image-sequence autoencoder " + str(args.latent_size),
                     capture_io=False)
    for arg, value in vars(args).items():
        exp.param(arg, value)

    try:
        imgseq_autoencoder.train()
        for epoch in range(start_epoch, args.epochs):
            print("Epoch: {}".format(epoch))
            for steps, batch in enumerate(train_loader):
                torch.cuda.empty_cache()
                feature = Variable(batch).to(device)
                optimizer.zero_grad()
                feature_hat = imgseq_autoencoder(feature)
                loss = criterion(feature_hat, feature)
                loss.backward()
                optimizer.step()
                scheduler.step()
                if (steps * args.batch_size) % args.log_interval == 0:
                    print("Epoch: {} at {} lr: {}".format(
                        epoch, str(datetime.datetime.now()),
                        str(scheduler.get_lr())))
                    print("Steps: {}".format(steps))
                    print("Loss: {}".format(loss.detach().item()))
                # FIX: removed dead `input_data = feature[0]` — it was never
                # used and kept a view of the batch alive past the `del`.
                # Drop references eagerly to keep GPU memory headroom.
                del feature, feature_hat, loss
            exp.log("\nEpoch: {} at {} lr: {}".format(
                epoch, str(datetime.datetime.now()),
                str(scheduler.get_lr())))
            _avg_loss = eval_reconstruction(imgseq_autoencoder, criterion,
                                            val_loader, device)
            exp.log("\nEvaluation - loss: {}".format(_avg_loss))
            util.save_models(
                {
                    'epoch': epoch + 1,
                    'imgseq_encoder': imgseq_encoder.state_dict(),
                    'imgseq_decoder': imgseq_decoder.state_dict(),
                    'avg_loss': _avg_loss,
                    'optimizer': optimizer.state_dict(),
                    'scheduler': scheduler.state_dict()
                }, CONFIG.CHECKPOINT_PATH,
                "imgseq_autoencoder_" + str(args.latent_size))
        print("Finish!!!")
    finally:
        # Always close the Hyperdash experiment, even on interrupt/error.
        exp.end()
def main():
    """Entry point: configure, train and validate an image classifier.

    Parses CLI args, seeds RNGs, dynamically imports the model module,
    snapshots scripts/args into a result directory, then runs the
    train/validate loop with Hyperdash logging and best-checkpoint saving.
    """
    args = parse_args()
    # set random seed
    #logger.info('> set random seed {}'.format(args.seed))
    random.seed(args.seed)
    np.random.seed(args.seed)
    # NOTE(review): torch's RNG is not seeded here — runs are not fully
    # reproducible even with fixed args.seed; confirm if intentional.
    # Set up Devices
    #logger.info('> set gpu device {}'.format(args.gpus))
    num_cuda_devices = utils.set_devices(args.gpus)
    # Load model: turn the model file path into a dotted module path
    # (strip the extension), import it, and instantiate the named class.
    #logger.info('> load model {}'.format(args.model_name))
    ext = os.path.splitext(args.model_file)[1]
    model_path = '.'.join(os.path.split(args.model_file)).replace(ext, '')
    model = import_module(model_path)
    model = getattr(model, args.model_name)(args.output_class)
    if num_cuda_devices > 0:
        # Wrap for multi-GPU; DataParallel is a no-op overheadwise on 1 GPU.
        model = torch.nn.DataParallel(model)
        model.cuda()
    logger.info('> set optimizer')
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=args.initial_lr,
                          momentum=args.lr_momentum)
    # Create result dir and mirror all logs into a file inside it.
    result_dir = create_result_dir(args.model_name)
    fh_handler = logging.FileHandler(os.path.join(result_dir, "log"))
    fh_handler.setFormatter(
        logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
    logger.addHandler(fh_handler)
    # Snapshot the model file and every top-level .py/.sh script so the
    # run is reproducible from the result directory alone.
    shutil.copy(args.model_file,
                os.path.join(result_dir, os.path.basename(args.model_file)))
    script_file_list = glob.glob('./*.py') + glob.glob('./*.sh')
    for file_name in script_file_list:
        shutil.copy(file_name,
                    os.path.join(result_dir, os.path.basename(file_name)))
    with open(os.path.join(result_dir, 'args'), 'w') as fp:
        fp.write(json.dumps(vars(args)))
    print(json.dumps(vars(args), sort_keys=True, indent=4))
    # Create Dataset
    logger.info('> Creating DataSet')
    train_transform = partial(transforms.transform_f,
                              random_angle=args.random_angle,
                              expand_ratio=args.expand_ratio,
                              crop_size=args.crop_size,
                              train=True)
    train = getdataset.getCcoreDataset(args.train_json, train_transform,
                                       args.train_mode)
    # NOTE(review): the validation set is built from args.train_json with
    # train=True augmentations — it validates on (augmented) training data.
    # Looks like a copy-paste slip; confirm whether a val_json / train=False
    # configuration was intended.
    val_transform = partial(transforms.transform_f,
                            random_angle=args.random_angle,
                            expand_ratio=args.expand_ratio,
                            crop_size=args.crop_size,
                            train=True)
    val = getdataset.getCcoreDataset(args.train_json, val_transform,
                                     args.train_mode)
    # Create DataLoader
    logger.info('> create dataloader')
    train_loader = torch.utils.data.DataLoader(train,
                                               batch_size=args.batchsize,
                                               shuffle=True,
                                               num_workers=4)
    val_loader = torch.utils.data.DataLoader(val,
                                             batch_size=args.batchsize,
                                             shuffle=False,
                                             num_workers=4)
    # Training
    logger.info('> run training')
    best_prec = 0
    # Create Hyperdash Experiment
    logger.info('> Create Hyperdash Experiment {}'.format(
        args.experiment_name))
    exp = Experiment(args.experiment_name,
                     api_key_getter=utils.get_api_key_from_env)
    for epoch in tqdm(range(args.training_epoch)):
        training_result = training(train_loader, model, criterion, optimizer)
        val_result = validate(val_loader, model, criterion)
        result_str = 'epoch : {} / {}\
 main/loss : {:.3f}\
 main/acc : {:.3f}\
 val/loss : {:.3f}\
 val/acc : {:.3f}'.format(epoch, args.training_epoch,
                          training_result['loss'], training_result['acc'],
                          val_result['loss'], val_result['acc'])
        logger.info(result_str)
        exp.log(result_str)
        prec1 = val_result['acc']
        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec
        best_prec = max(prec1, best_prec)
        if is_best:
            save_checkpoint(
                state={
                    'epoch': epoch + 1,
                    #'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_prec': best_prec,
                    'optimizer': optimizer.state_dict(),
                },
                is_best=is_best,
                result_dir=result_dir)
        exp.metric('main/loss', training_result['loss'])
        exp.metric('val/loss', val_result['loss'])
    logger.info('> end training')
    exp.end()
def test_experiment(self):
    """End-to-end test of the Experiment API against the fake SDK server.

    Runs a small job (log, param, metric inside exp.iter), then checks the
    captured messages, HTTP headers, stdout, and the persisted log file.
    Relies on module-level fixtures `server_sdk_messages` /
    `server_sdk_headers` populated by the test server.
    """
    # Run a test job via the Experiment API
    # Make sure log file is where is supposed to be
    # look at decorator
    # verify run start/stop is sent
    with patch("sys.stdout", new=StringIO()) as faked_out:
        exp = Experiment("MNIST")
        exp.log("test print")
        exp.param("batch size", 32)
        # exp.iter(2) also emits internal hd_iter_0 params/metrics,
        # asserted below alongside the user-facing "accuracy" metric.
        for i in exp.iter(2):
            time.sleep(1)
            exp.metric("accuracy", i * 0.2)
        time.sleep(0.1)
        exp.end()
    # Test params match what is expected: one user param, one internal
    # param emitted by exp.iter, in that order.
    params_messages = []
    for msg in server_sdk_messages:
        payload = msg["payload"]
        if "params" in payload:
            params_messages.append(payload)
    expect_params = [
        {
            "params": {
                "batch size": 32,
            },
            "is_internal": False,
        },
        {
            "params": {
                "hd_iter_0_epochs": 2,
            },
            "is_internal": True,
        },
    ]
    assert len(expect_params) == len(params_messages)
    for i, message in enumerate(params_messages):
        assert message == expect_params[i]
    # Test metrics match what is expected: internal iteration metric then
    # user metric, alternating per loop iteration.
    metrics_messages = []
    for msg in server_sdk_messages:
        payload = msg["payload"]
        if "name" in payload:
            metrics_messages.append(payload)
    expect_metrics = [
        {
            "is_internal": True,
            "name": "hd_iter_0",
            "value": 0
        },
        {
            "is_internal": False,
            "name": "accuracy",
            "value": 0
        },
        {
            "is_internal": True,
            "name": "hd_iter_0",
            "value": 1
        },
        {
            "is_internal": False,
            "name": "accuracy",
            "value": 0.2
        },
    ]
    assert len(expect_metrics) == len(metrics_messages)
    for i, message in enumerate(metrics_messages):
        assert message == expect_metrics[i]
    # Nothing written to (patched) stdout should mention an error.
    captured_out = faked_out.getvalue()
    assert "error" not in captured_out
    # Make sure correct API name / version headers are sent
    assert server_sdk_headers[0][API_KEY_NAME] == API_NAME_EXPERIMENT
    assert server_sdk_headers[0][
        VERSION_KEY_NAME] == get_hyperdash_version()
    # Make sure logs were persisted: pick the most recently modified file
    # in the job's log directory and check the expected lines appear.
    expect_logs = [
        "{ batch size: 32 }",
        "test print",
        "| Iteration 0 of 1 |",
        "| accuracy: 0.000000 |",
    ]
    log_dir = get_hyperdash_logs_home_path_for_job("MNIST")
    latest_log_file = max([
        os.path.join(log_dir, filename) for filename in os.listdir(log_dir)
    ], key=os.path.getmtime)
    with open(latest_log_file, "r") as log_file:
        data = log_file.read()
        for log in expect_logs:
            assert_in(log, data)
    # Clean up so subsequent runs see a fresh "latest" log file.
    os.remove(latest_log_file)
def train_reconstruction(args):
    """Pretrain the ResNet50 image autoencoder and checkpoint it.

    Builds a ResNet50Encoder/ResNet50Decoder autoencoder, trains it with
    MSE reconstruction loss under a cyclical learning-rate schedule, and
    logs progress to a Hyperdash experiment.

    Args:
        args: parsed CLI namespace. Reads gpu, batch_size, shuffle, resume,
            weight_decay, half_cycle_interval, lr, lr_factor, latent_size,
            epochs, log_interval.
    """
    device = torch.device(args.gpu)
    print("Loading dataset...")
    train_dataset, val_dataset = load_image_pretrain_data(args, CONFIG)
    print("Loading dataset completed")
    # FIX: the validation loader used shuffle=True; validation should be
    # deterministic and match the sibling trainers (shuffle=False).
    train_loader, val_loader = DataLoader(train_dataset,
                                          batch_size=args.batch_size,
                                          shuffle=args.shuffle),\
        DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False)

    image_encoder = ResNet50Encoder()
    image_encoder.init_weights()
    image_decoder = ResNet50Decoder()

    if args.resume:
        print("Restart from checkpoint")
        # map_location keeps the load CPU-side; moved to device below.
        checkpoint = torch.load(os.path.join(CONFIG.CHECKPOINT_PATH,
                                             args.resume),
                                map_location=lambda storage, loc: storage)
        start_epoch = checkpoint['epoch']
        image_encoder.load_state_dict(checkpoint['image_encoder'])
        image_decoder.load_state_dict(checkpoint['image_decoder'])
    else:
        print("Start from initial")
        start_epoch = 0

    image_autoencoder = ResNet_autoencoder(image_encoder, image_decoder)
    criterion = nn.MSELoss().to(device)
    image_autoencoder.to(device)

    # lr=1. is a placeholder: the LambdaLR multiplier supplies the actual
    # (cyclical) learning rate each step.
    optimizer = AdamW(image_autoencoder.parameters(), lr=1.,
                      weight_decay=args.weight_decay, amsgrad=True)
    step_size = args.half_cycle_interval * len(train_loader)
    clr = cyclical_lr(step_size, min_lr=args.lr,
                      max_lr=args.lr * args.lr_factor)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, [clr])
    if args.resume:
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])

    exp = Experiment("Image-sequence Component Pretrain " +
                     str(args.latent_size), capture_io=False)
    for arg, value in vars(args).items():
        exp.param(arg, value)

    try:
        image_autoencoder.train()
        for epoch in range(start_epoch, args.epochs):
            print("Epoch: {}".format(epoch))
            for steps, batch in enumerate(train_loader):
                torch.cuda.empty_cache()
                feature = Variable(batch).to(device)
                optimizer.zero_grad()
                feature_hat = image_autoencoder(feature)
                loss = criterion(feature_hat, feature)
                loss.backward()
                optimizer.step()
                scheduler.step()
                if (steps * args.batch_size) % args.log_interval == 0:
                    print("Epoch: {} at {} lr: {}".format(
                        epoch, str(datetime.datetime.now()),
                        str(scheduler.get_lr())))
                    print("Steps: {}".format(steps))
                    print("Loss: {}".format(loss.detach().item()))
                # Drop references eagerly to keep GPU memory headroom.
                del feature, feature_hat, loss
            exp.log("\nEpoch: {} at {} lr: {}".format(
                epoch, str(datetime.datetime.now()),
                str(scheduler.get_lr())))
            _avg_loss = eval_reconstruction(image_autoencoder, criterion,
                                            val_loader, device, epoch)
            exp.log("\nEvaluation - loss: {}".format(_avg_loss))
            util.save_models(
                {
                    'epoch': epoch + 1,
                    'image_encoder': image_encoder.state_dict(),
                    'image_decoder': image_decoder.state_dict(),
                    'avg_loss': _avg_loss,
                    'optimizer': optimizer.state_dict(),
                    'scheduler': scheduler.state_dict()
                }, CONFIG.CHECKPOINT_PATH,
                # NOTE(review): name lacks the trailing underscore the
                # sibling trainers use ("text_autoencoder_", ...); kept
                # as-is so existing resume paths still match.
                "image_pretrain" + str(args.latent_size))
        print("Finish!!!")
    finally:
        # Always close the Hyperdash experiment, even on interrupt/error.
        exp.end()