Пример #1
0
def train_reconstruction(args):
    """Train the convolutional text autoencoder, checkpointing every epoch.

    Loads the pickled embedding matrix and the ``word_idx.json`` vocabulary
    for ``args.target_dataset``, builds the encoder/decoder pair, optionally
    resumes from the checkpoint named by ``args.resume``, and trains with
    AdamW under a ``LambdaLR`` schedule driven by ``cyclical_lr``.  After each
    epoch the model is evaluated with ``eval_reconstruction_with_rouge`` and a
    checkpoint is written through ``util.save_models``.

    Args:
        args: Parsed option namespace.  Attributes read here: ``gpu``,
            ``target_dataset``, ``batch_size``, ``shuffle``, ``filter_shape``,
            ``filter_size``, ``latent_size``, ``tau``, ``resume``,
            ``weight_decay``, ``half_cycle_interval``, ``lr``, ``lr_factor``,
            ``epochs`` and ``log_interval``.  ``args.t3`` is set as a side
            effect.
    """
    device = torch.device(args.gpu)
    dataset_dir = os.path.join(CONFIG.DATASET_PATH, args.target_dataset)

    print("Loading embedding model...")
    # NOTE(security): unpickling can execute arbitrary code -- only load
    # embedding files that come from a trusted source.
    with open(os.path.join(dataset_dir, 'word_embedding.p'), "rb") as f:
        embedding_model = cPickle.load(f)
    with open(os.path.join(dataset_dir, 'word_idx.json'), "r",
              encoding='utf-8') as f:
        word_idx = json.load(f)
    # The JSON holds a pair: element 0 is used as idx2word, element 1 as
    # word2idx.
    idx2word, word2idx = word_idx[0], word_idx[1]
    print("Loading embedding model completed")

    print("Loading dataset...")
    train_dataset, val_dataset = load_text_data(args,
                                                CONFIG,
                                                word2idx=word2idx)
    print("Loading dataset completed")
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=args.shuffle)
    val_loader = DataLoader(val_dataset,
                            batch_size=args.batch_size,
                            shuffle=False)

    # Sequence length after each down-sampling step; the divisor 2 is the
    # stride size.
    t1 = CONFIG.MAX_SENTENCE_LEN
    t2 = int(math.floor((t1 - args.filter_shape) / 2) + 1)
    t3 = int(math.floor((t2 - args.filter_shape) / 2) + 1)
    args.t3 = t3  # stored on args so it is reported with the other params

    embedding = nn.Embedding.from_pretrained(
        torch.FloatTensor(embedding_model))
    text_encoder = text_model.ConvolutionEncoder(embedding, t3,
                                                 args.filter_size,
                                                 args.filter_shape,
                                                 args.latent_size)
    text_decoder = text_model.DeconvolutionDecoder(embedding, args.tau, t3,
                                                   args.filter_size,
                                                   args.filter_shape,
                                                   args.latent_size, device)
    if args.resume:
        print("Restart from checkpoint")
        # Load onto CPU first; the modules are moved to `device` below.
        checkpoint = torch.load(os.path.join(CONFIG.CHECKPOINT_PATH,
                                             args.resume),
                                map_location=lambda storage, loc: storage)
        start_epoch = checkpoint['epoch']
        text_encoder.load_state_dict(checkpoint['text_encoder'])
        text_decoder.load_state_dict(checkpoint['text_decoder'])
    else:
        print("Start from initial")
        start_epoch = 0

    text_autoencoder = text_model.TextAutoencoder(text_encoder, text_decoder)
    criterion = nn.NLLLoss().to(device)
    text_autoencoder.to(device)

    # The base lr is 1.0 because LambdaLR multiplies the base lr by the
    # lambda's value; presumably `clr` yields the absolute learning rate
    # between min_lr and max_lr -- confirm against cyclical_lr.
    optimizer = AdamW(text_autoencoder.parameters(),
                      lr=1.,
                      weight_decay=args.weight_decay,
                      amsgrad=True)
    step_size = args.half_cycle_interval * len(train_loader)
    clr = cyclical_lr(step_size,
                      min_lr=args.lr,
                      max_lr=args.lr * args.lr_factor)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, [clr])
    if args.resume:
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])

    exp = Experiment("Text autoencoder " + str(args.latent_size),
                     capture_io=False)
    for arg, value in vars(args).items():
        exp.param(arg, value)

    try:
        for epoch in range(start_epoch, args.epochs):
            # Re-enter training mode every epoch: the evaluation pass at the
            # end of the previous epoch may have left the model in eval mode.
            text_autoencoder.train()
            print("Epoch: {}".format(epoch))
            for steps, batch in enumerate(train_loader):
                torch.cuda.empty_cache()
                feature = Variable(batch).to(device)
                optimizer.zero_grad()
                prob = text_autoencoder(feature)
                loss = criterion(prob.transpose(1, 2), feature)
                loss.backward()
                optimizer.step()
                scheduler.step()  # the schedule advances per batch

                if (steps * args.batch_size) % args.log_interval == 0:
                    # Show the first sample of the batch next to its
                    # reconstruction.
                    input_data = feature[0]
                    single_data = prob[0]
                    _, predict_index = torch.max(single_data, 1)
                    input_sentence = util.transform_idx2word(
                        input_data.detach().cpu().numpy(),
                        idx2word=idx2word)
                    predict_sentence = util.transform_idx2word(
                        predict_index.detach().cpu().numpy(),
                        idx2word=idx2word)
                    print("Epoch: {} at {} lr: {}".format(
                        epoch, str(datetime.datetime.now()),
                        str(scheduler.get_lr())))
                    print("Steps: {}".format(steps))
                    print("Loss: {}".format(loss.detach().item()))
                    print("Input Sentence:")
                    print(input_sentence)
                    print("Output Sentence:")
                    print(predict_sentence)
                    del input_data, single_data, _, predict_index
                # Drop references so the tensors can be freed before the
                # next batch is loaded.
                del feature, prob, loss

            exp.log("\nEpoch: {} at {} lr: {}".format(
                epoch, str(datetime.datetime.now()), str(scheduler.get_lr())))
            _avg_loss, _rouge_1, _rouge_2 = eval_reconstruction_with_rouge(
                text_autoencoder, idx2word, criterion, val_loader, device)
            exp.log("\nEvaluation - loss: {}  Rouge1: {} Rouge2: {}".format(
                _avg_loss, _rouge_1, _rouge_2))

            util.save_models(
                {
                    # Epoch to resume from, hence the +1.
                    'epoch': epoch + 1,
                    'text_encoder': text_encoder.state_dict(),
                    'text_decoder': text_decoder.state_dict(),
                    'avg_loss': _avg_loss,
                    # NOTE(review): the trailing colon in this key looks
                    # accidental; kept as-is so existing checkpoints and
                    # readers keep working.
                    'Rouge1:': _rouge_1,
                    'Rouge2': _rouge_2,
                    'optimizer': optimizer.state_dict(),
                    'scheduler': scheduler.state_dict()
                }, CONFIG.CHECKPOINT_PATH,
                "text_autoencoder_" + str(args.latent_size))

        print("Finish!!!")

    finally:
        # Always close the experiment, even when training is interrupted.
        exp.end()
def train_reconstruction(args):
    """Train the image-sequence autoencoder, checkpointing every epoch.

    Builds a convolutional encoder/decoder pair, optionally resumes from the
    checkpoint named by ``args.resume``, and trains with AdamW under a
    ``LambdaLR`` schedule driven by ``cyclical_lr`` using an MSE
    reconstruction loss.  After each epoch the model is evaluated with
    ``eval_reconstruction`` and a checkpoint is written through
    ``util.save_models``.

    Args:
        args: Parsed option namespace.  Attributes read here: ``gpu``,
            ``batch_size``, ``shuffle``, ``embedding_dim``, ``resume``,
            ``weight_decay``, ``half_cycle_interval``, ``lr``, ``lr_factor``,
            ``latent_size`` (used only in the experiment and checkpoint
            names), ``epochs`` and ``log_interval``.
    """
    device = torch.device(args.gpu)
    print("Loading dataset...")
    train_dataset, val_dataset = load_imgseq_data(args, CONFIG)
    print("Loading dataset completed")
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=args.shuffle)
    val_loader = DataLoader(val_dataset,
                            batch_size=args.batch_size,
                            shuffle=False)

    # Sequence length after each of the two steps, for a width-3 filter and
    # stride 1.
    filter_shape = 3
    stride = 1
    t1 = CONFIG.MAX_SEQUENCE_LEN
    t2 = int(math.floor((t1 - filter_shape) / stride) + 1)
    t3 = int(math.floor((t2 - filter_shape) / stride) + 1)

    # NOTE(review): latent_size is hard-coded to 1000 here while the
    # experiment and checkpoint names use args.latent_size -- confirm which
    # one is intended.
    imgseq_encoder = imgseq_model.ConvolutionEncoder(
        embedding_dim=args.embedding_dim,
        t3=t3,
        filter_size=300,
        filter_shape=filter_shape,
        latent_size=1000)
    imgseq_decoder = imgseq_model.DeconvolutionDecoder(
        embedding_dim=args.embedding_dim,
        t3=t3,
        filter_size=300,
        filter_shape=filter_shape,
        latent_size=1000)
    if args.resume:
        print("Restart from checkpoint")
        # Load onto CPU first; the modules are moved to `device` below.
        checkpoint = torch.load(os.path.join(CONFIG.CHECKPOINT_PATH,
                                             args.resume),
                                map_location=lambda storage, loc: storage)
        start_epoch = checkpoint['epoch']
        imgseq_encoder.load_state_dict(checkpoint['imgseq_encoder'])
        imgseq_decoder.load_state_dict(checkpoint['imgseq_decoder'])
    else:
        print("Start from initial")
        start_epoch = 0

    imgseq_autoencoder = imgseq_model.ImgseqAutoEncoder(
        imgseq_encoder, imgseq_decoder)
    criterion = nn.MSELoss().to(device)
    imgseq_autoencoder.to(device)

    # The base lr is 1.0 because LambdaLR multiplies the base lr by the
    # lambda's value; presumably `clr` yields the absolute learning rate
    # between min_lr and max_lr -- confirm against cyclical_lr.
    optimizer = AdamW(imgseq_autoencoder.parameters(),
                      lr=1.,
                      weight_decay=args.weight_decay,
                      amsgrad=True)
    step_size = args.half_cycle_interval * len(train_loader)
    clr = cyclical_lr(step_size,
                      min_lr=args.lr,
                      max_lr=args.lr * args.lr_factor)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, [clr])

    if args.resume:
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])

    exp = Experiment("Image-sequence autoencoder " + str(args.latent_size),
                     capture_io=False)
    for arg, value in vars(args).items():
        exp.param(arg, value)

    try:
        for epoch in range(start_epoch, args.epochs):
            # Re-enter training mode every epoch: the evaluation pass at the
            # end of the previous epoch may have left the model in eval mode.
            imgseq_autoencoder.train()
            print("Epoch: {}".format(epoch))
            for steps, batch in enumerate(train_loader):
                torch.cuda.empty_cache()
                feature = Variable(batch).to(device)
                optimizer.zero_grad()
                feature_hat = imgseq_autoencoder(feature)
                loss = criterion(feature_hat, feature)
                loss.backward()
                optimizer.step()
                scheduler.step()  # the schedule advances per batch

                if (steps * args.batch_size) % args.log_interval == 0:
                    print("Epoch: {} at {} lr: {}".format(
                        epoch, str(datetime.datetime.now()),
                        str(scheduler.get_lr())))
                    print("Steps: {}".format(steps))
                    print("Loss: {}".format(loss.detach().item()))
                # Drop references so the tensors can be freed before the
                # next batch is loaded.
                del feature, feature_hat, loss

            exp.log("\nEpoch: {} at {} lr: {}".format(
                epoch, str(datetime.datetime.now()), str(scheduler.get_lr())))
            _avg_loss = eval_reconstruction(imgseq_autoencoder, criterion,
                                            val_loader, device)
            exp.log("\nEvaluation - loss: {}".format(_avg_loss))

            util.save_models(
                {
                    # Epoch to resume from, hence the +1.
                    'epoch': epoch + 1,
                    'imgseq_encoder': imgseq_encoder.state_dict(),
                    'imgseq_decoder': imgseq_decoder.state_dict(),
                    'avg_loss': _avg_loss,
                    'optimizer': optimizer.state_dict(),
                    'scheduler': scheduler.state_dict()
                }, CONFIG.CHECKPOINT_PATH,
                "imgseq_autoencoder_" + str(args.latent_size))

        print("Finish!!!")

    finally:
        # Always close the experiment, even when training is interrupted.
        exp.end()
Пример #3
0
def main():
    """Train and validate a classifier, keeping the best checkpoint.

    Parses the command line, seeds ``random`` and ``numpy``, imports the
    model class named by ``args.model_name`` from ``args.model_file``,
    snapshots the scripts and arguments into a fresh result directory, then
    runs ``args.training_epoch`` epochs of ``training`` / ``validate`` while
    reporting to a Hyperdash ``Experiment``.  A checkpoint is saved whenever
    validation accuracy improves.
    """
    args = parse_args()

    # Set random seed
    random.seed(args.seed)
    np.random.seed(args.seed)

    # Set up devices
    num_cuda_devices = utils.set_devices(args.gpus)

    # Load model: turn the file path into a dotted module name.  Normalising
    # first and swapping every separator makes this work for a bare file
    # name, a leading "./" and nested directories as well.
    module_root = os.path.splitext(os.path.normpath(args.model_file))[0]
    model_path = module_root.replace(os.sep, '.')
    model = import_module(model_path)
    model = getattr(model, args.model_name)(args.output_class)
    if num_cuda_devices > 0:
        model = torch.nn.DataParallel(model)
        model.cuda()

    logger.info('> set optimizer')
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(),
                          lr=args.initial_lr,
                          momentum=args.lr_momentum)

    # Create result dir and mirror the log into it
    result_dir = create_result_dir(args.model_name)

    fh_handler = logging.FileHandler(os.path.join(result_dir, "log"))
    fh_handler.setFormatter(
        logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
    logger.addHandler(fh_handler)

    # Snapshot the model file, the scripts in the working directory and the
    # arguments next to the results.
    shutil.copy(args.model_file,
                os.path.join(result_dir, os.path.basename(args.model_file)))
    script_file_list = glob.glob('./*.py') + glob.glob('./*.sh')
    for file_name in script_file_list:
        shutil.copy(file_name,
                    os.path.join(result_dir, os.path.basename(file_name)))
    with open(os.path.join(result_dir, 'args'), 'w') as fp:
        json.dump(vars(args), fp)
    print(json.dumps(vars(args), sort_keys=True, indent=4))

    # Create Dataset
    logger.info('> Creating DataSet')
    train_transform = partial(transforms.transform_f,
                              random_angle=args.random_angle,
                              expand_ratio=args.expand_ratio,
                              crop_size=args.crop_size,
                              train=True)
    train = getdataset.getCcoreDataset(args.train_json, train_transform,
                                       args.train_mode)

    # NOTE(review): validation reuses args.train_json, args.train_mode and
    # train=True, exactly like the training set above -- this looks like a
    # copy-paste; confirm whether a separate validation source is intended.
    val_transform = partial(transforms.transform_f,
                            random_angle=args.random_angle,
                            expand_ratio=args.expand_ratio,
                            crop_size=args.crop_size,
                            train=True)
    val = getdataset.getCcoreDataset(args.train_json, val_transform,
                                     args.train_mode)

    # Create DataLoader
    logger.info('> create dataloader')
    train_loader = torch.utils.data.DataLoader(train,
                                               batch_size=args.batchsize,
                                               shuffle=True,
                                               num_workers=4)
    val_loader = torch.utils.data.DataLoader(val,
                                             batch_size=args.batchsize,
                                             shuffle=False,
                                             num_workers=4)

    # Training
    logger.info('> run training')
    best_prec = 0

    # Create Hyperdash Experiment
    logger.info('> Create Hyperdash Experiment {}'.format(
        args.experiment_name))
    exp = Experiment(args.experiment_name,
                     api_key_getter=utils.get_api_key_from_env)

    try:
        for epoch in tqdm(range(args.training_epoch)):

            training_result = training(train_loader, model, criterion,
                                       optimizer)
            val_result = validate(val_loader, model, criterion)

            # The eight-space gaps reproduce the original log line exactly.
            result_str = ('epoch : {} / {}'
                          '        main/loss : {:.3f}'
                          '        main/acc : {:.3f}'
                          '        val/loss : {:.3f}'
                          '        val/acc : {:.3f}').format(
                              epoch, args.training_epoch,
                              training_result['loss'],
                              training_result['acc'], val_result['loss'],
                              val_result['acc'])
            logger.info(result_str)
            exp.log(result_str)

            prec1 = val_result['acc']

            # Remember best prec@1 and save checkpoint
            is_best = prec1 > best_prec
            best_prec = max(prec1, best_prec)
            if is_best:
                save_checkpoint(
                    state={
                        'epoch': epoch + 1,
                        'state_dict': model.state_dict(),
                        'best_prec': best_prec,
                        'optimizer': optimizer.state_dict(),
                    },
                    is_best=is_best,
                    result_dir=result_dir)

            exp.metric('main/loss', training_result['loss'])
            exp.metric('val/loss', val_result['loss'])

        logger.info('> end training')
    finally:
        # Always close the experiment, even when an epoch raises.
        exp.end()
Пример #4
0
    def test_experiment(self):
        # Drive a short job through the Experiment API with stdout captured,
        # then check the params, metrics and headers recorded in
        # server_sdk_messages / server_sdk_headers, and finally the log file
        # written for the job.
        with patch("sys.stdout", new=StringIO()) as faked_out:
            exp = Experiment("MNIST")
            exp.log("test print")
            exp.param("batch size", 32)
            for step in exp.iter(2):
                time.sleep(1)
                exp.metric("accuracy", step * 0.2)
            time.sleep(0.1)
            exp.end()

        # Params: one user-supplied entry, then the internal iteration count.
        params_messages = [
            msg["payload"] for msg in server_sdk_messages
            if "params" in msg["payload"]
        ]
        expect_params = [
            {"params": {"batch size": 32}, "is_internal": False},
            {"params": {"hd_iter_0_epochs": 2}, "is_internal": True},
        ]
        assert len(expect_params) == len(params_messages)
        for actual, expected in zip(params_messages, expect_params):
            assert actual == expected

        # Metrics: each iteration emits the internal counter followed by the
        # user metric.
        metrics_messages = [
            msg["payload"] for msg in server_sdk_messages
            if "name" in msg["payload"]
        ]
        expect_metrics = [
            {"is_internal": True, "name": "hd_iter_0", "value": 0},
            {"is_internal": False, "name": "accuracy", "value": 0},
            {"is_internal": True, "name": "hd_iter_0", "value": 1},
            {"is_internal": False, "name": "accuracy", "value": 0.2},
        ]
        assert len(expect_metrics) == len(metrics_messages)
        for actual, expected in zip(metrics_messages, expect_metrics):
            assert actual == expected

        # Nothing that looks like an error may have been printed.
        captured_out = faked_out.getvalue()
        assert "error" not in captured_out

        # Correct API name / version headers must have been sent.
        first_headers = server_sdk_headers[0]
        assert first_headers[API_KEY_NAME] == API_NAME_EXPERIMENT
        assert first_headers[VERSION_KEY_NAME] == get_hyperdash_version()

        # The most recently modified log file of the job must contain these
        # lines.
        expect_logs = [
            "{ batch size: 32 }",
            "test print",
            "| Iteration 0 of 1 |",
            "| accuracy:   0.000000 |",
        ]
        log_dir = get_hyperdash_logs_home_path_for_job("MNIST")
        latest_log_file = max(
            (os.path.join(log_dir, name) for name in os.listdir(log_dir)),
            key=os.path.getmtime)
        with open(latest_log_file, "r") as log_file:
            data = log_file.read()
        for expected_line in expect_logs:
            assert_in(expected_line, data)
        os.remove(latest_log_file)
Пример #5
0
def train_reconstruction(args):
    """Pre-train the ResNet50 image autoencoder, checkpointing every epoch.

    Builds ``ResNet50Encoder`` / ``ResNet50Decoder``, optionally resumes from
    the checkpoint named by ``args.resume``, and trains with AdamW under a
    ``LambdaLR`` schedule driven by ``cyclical_lr`` using an MSE
    reconstruction loss.  After each epoch the model is evaluated with
    ``eval_reconstruction`` and a checkpoint is written through
    ``util.save_models``.

    Args:
        args: Parsed option namespace.  Attributes read here: ``gpu``,
            ``batch_size``, ``shuffle``, ``resume``, ``weight_decay``,
            ``half_cycle_interval``, ``lr``, ``lr_factor``, ``latent_size``
            (used only in the experiment and checkpoint names), ``epochs``
            and ``log_interval``.
    """
    device = torch.device(args.gpu)
    print("Loading dataset...")
    train_dataset, val_dataset = load_image_pretrain_data(args, CONFIG)
    print("Loading dataset completed")
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=args.shuffle)
    # NOTE(review): the validation loader is shuffled here; confirm that
    # eval_reconstruction does not rely on a fixed batch order.
    val_loader = DataLoader(val_dataset,
                            batch_size=args.batch_size,
                            shuffle=True)

    image_encoder = ResNet50Encoder()
    image_encoder.init_weights()
    image_decoder = ResNet50Decoder()
    if args.resume:
        print("Restart from checkpoint")
        # Load onto CPU first; the modules are moved to `device` below.
        checkpoint = torch.load(os.path.join(CONFIG.CHECKPOINT_PATH,
                                             args.resume),
                                map_location=lambda storage, loc: storage)
        start_epoch = checkpoint['epoch']
        image_encoder.load_state_dict(checkpoint['image_encoder'])
        image_decoder.load_state_dict(checkpoint['image_decoder'])
    else:
        print("Start from initial")
        start_epoch = 0

    image_autoencoder = ResNet_autoencoder(image_encoder, image_decoder)
    criterion = nn.MSELoss().to(device)
    image_autoencoder.to(device)

    # The base lr is 1.0 because LambdaLR multiplies the base lr by the
    # lambda's value; presumably `clr` yields the absolute learning rate
    # between min_lr and max_lr -- confirm against cyclical_lr.
    optimizer = AdamW(image_autoencoder.parameters(),
                      lr=1.,
                      weight_decay=args.weight_decay,
                      amsgrad=True)
    step_size = args.half_cycle_interval * len(train_loader)
    clr = cyclical_lr(step_size,
                      min_lr=args.lr,
                      max_lr=args.lr * args.lr_factor)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, [clr])

    if args.resume:
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])

    exp = Experiment("Image-sequence Component Pretrain " +
                     str(args.latent_size),
                     capture_io=False)
    for arg, value in vars(args).items():
        exp.param(arg, value)

    try:
        for epoch in range(start_epoch, args.epochs):
            # Re-enter training mode every epoch: the evaluation pass at the
            # end of the previous epoch may have left the model in eval mode.
            image_autoencoder.train()
            print("Epoch: {}".format(epoch))
            for steps, batch in enumerate(train_loader):
                torch.cuda.empty_cache()
                feature = Variable(batch).to(device)
                optimizer.zero_grad()
                feature_hat = image_autoencoder(feature)
                loss = criterion(feature_hat, feature)
                loss.backward()
                optimizer.step()
                scheduler.step()  # the schedule advances per batch

                if (steps * args.batch_size) % args.log_interval == 0:
                    print("Epoch: {} at {} lr: {}".format(
                        epoch, str(datetime.datetime.now()),
                        str(scheduler.get_lr())))
                    print("Steps: {}".format(steps))
                    print("Loss: {}".format(loss.detach().item()))
                # Drop references so the tensors can be freed before the
                # next batch is loaded.
                del feature, feature_hat, loss

            exp.log("\nEpoch: {} at {} lr: {}".format(
                epoch, str(datetime.datetime.now()), str(scheduler.get_lr())))
            _avg_loss = eval_reconstruction(image_autoencoder, criterion,
                                            val_loader, device, epoch)
            exp.log("\nEvaluation - loss: {}".format(_avg_loss))

            util.save_models(
                {
                    # Epoch to resume from, hence the +1.
                    'epoch': epoch + 1,
                    'image_encoder': image_encoder.state_dict(),
                    'image_decoder': image_decoder.state_dict(),
                    'avg_loss': _avg_loss,
                    'optimizer': optimizer.state_dict(),
                    'scheduler': scheduler.state_dict()
                }, CONFIG.CHECKPOINT_PATH,
                "image_pretrain" + str(args.latent_size))

        print("Finish!!!")

    finally:
        # Always close the experiment, even when training is interrupted.
        exp.end()