Example #1
    def _init_dataset(self):
        trainset = get_dataset(mode='train')
        valset = get_dataset(mode='val')

        print('No. of train images: ', len(trainset))
        print('No. of val images: ', len(valset))

        self.batch_size = self.cfg['training']['batch_size']
        kwargs = {
            'num_workers': self.cfg['training']['n_workers'],
            'pin_memory': True
        }

        self.train_queue = data.DataLoader(trainset,
                                           batch_size=self.batch_size,
                                           shuffle=True,
                                           drop_last=False,
                                           **kwargs)

        self.valid_queue = data.DataLoader(valset,
                                           batch_size=self.batch_size,
                                           shuffle=False,
                                           drop_last=False,
                                           **kwargs)
Example #2
def main(args):
    # Is GPU usable?
    assert torch.cuda.is_available()

    # load encoder/decoder vocabularies
    logger.debug("Loading Vocabulary...")
    # encoder vocabulary
    with open(args.en_vocab_path, "rb") as f:
        en_vocab = pickle.load(f)
    logger.debug("Encoder vocab size: {}".format(len(en_vocab)))
    # decoder vocabulary
    with open(args.de_vocab_path, "rb") as f:
        de_vocab = pickle.load(f)
    logger.debug("Decoder vocab size: {}".format(len(de_vocab)))
    en_size, de_size = len(en_vocab), len(de_vocab)
    logger.debug("[source_vocab]:%d [target_vocab]:%d" % (en_size, de_size))

    # setting train and val dataloader
    logger.debug("Preparing dataset...")
    train_iter = get_dataset(args.train_path, en_vocab, de_vocab,
                             args.batch_size, args.shuffle, args.num_workers)
    val_iter = get_dataset(args.val_path, en_vocab, de_vocab, args.batch_size,
                           args.shuffle, args.num_workers)

    # setting seq2seq model
    logger.debug("Instantiating models...")
    encoder = Encoder(en_size,
                      args.embed_dim,
                      args.hidden_dim,
                      n_layers=args.en_n_layers,
                      dropout=args.en_dropout)
    decoder = Decoder(args.embed_dim,
                      args.hidden_dim,
                      de_size,
                      n_layers=args.de_n_layers,
                      dropout=args.de_dropout)
    seq2seq = Seq2Seq(encoder, decoder).cuda()
    if args.pre_trained_path is not None:
        seq2seq.load_state_dict(torch.load(args.pre_trained_path))
        logger.debug("Loaded pre-trained model: {0}".format(
            args.pre_trained_path))
    optimizer = optim.Adam(seq2seq.parameters(), lr=args.lr)
    logger.debug(seq2seq)

    # Training and validation model
    best_val_loss = None
    for epoch in range(1, args.epochs + 1):
        train(epoch, seq2seq, optimizer, train_iter, de_size, args.grad_clip,
              en_vocab, de_vocab)
        val_loss = evaluate(seq2seq, val_iter, de_size, en_vocab, de_vocab)
        logger.debug("VAL LOSS: {0:.5f} (epoch={1})".format(val_loss, epoch))

        # Save the model if the validation loss is the best we've seen so far.
        if (best_val_loss is None) or (val_loss < best_val_loss):
            logger.debug("save model (epoch={0})".format(epoch))
            torch.save(seq2seq.state_dict(), args.save_model_path)
            best_val_loss = val_loss
Example #3
def train():
    dataset = data.get_dataset(train=True)
    iterator = dataset.make_one_shot_iterator()
    next_element = iterator.get_next()

    siamese = Siamese()
    optimizer = tf.train.AdamOptimizer(FLAGS.lr)
    train_step = optimizer.minimize(siamese.loss)

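    # log loss and accuracy as TensorBoard summaries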
    tf.summary.scalar('loss', siamese.loss)
    tf.summary.scalar('acc', siamese.accuracy)
    merged_summaries = tf.summary.merge_all()

    saver = tf.train.Saver()
    with tf.Session() as sess:
        train_writer = tf.summary.FileWriter(FLAGS.summaries_dir, sess.graph)
        sess.run(tf.global_variables_initializer())

        for i in trange(FLAGS.n_iters):
            x1, x2, y = sess.run(next_element)
            _, loss, summary = sess.run(
                [train_step, siamese.loss, merged_summaries],
                feed_dict={
                    siamese.x1: x1,
                    siamese.x2: x2,
                    siamese.y: y,
                })
            assert not np.isnan(loss), 'Model diverged with loss = NaN'
            train_writer.add_summary(summary, i)

            if i % 1000 == 0:
                saver.save(sess, FLAGS.model_path)
        print('Training completed, model saved:',
              saver.save(sess, FLAGS.model_path))
Example #4
    def run(self, use_gpu, learners, params_grid, dataset_dir, result_file, out_dir):
        dataset = get_dataset(self.name, dataset_dir)

        device_type = 'GPU' if use_gpu else 'CPU'

        for LearnerType in learners:
            learner = LearnerType(dataset, self.task, self.metric, use_gpu)
            algorithm_name = learner.name() + '-' + device_type
            print('Started to train ' + algorithm_name)

            for params in ParameterGrid(params_grid):
                params_str = params_to_str(params)
                log_file = os.path.join(out_dir, self.name, algorithm_name, params_str + '.log')

                print(params_str)

                hash_id = Track.hash(self.name, algorithm_name, self.task, params_str)
                if check_exists(hash_id, result_file):
                    print('Skipped: already evaluated')
                    continue

                try:
                    elapsed = learner.run(params, log_file)
                    print('Timing: ' + str(elapsed) + ' sec')

                    track = parse_log(algorithm_name, self.name, self.task, params_str,
                                      log_file, params['iterations'])
                    update_result_file(track, result_file)
                except Exception as e:
                    print('Exception during training: ' + repr(e))
Example #5
def plot_principal_component_examples(run_id,
                                      dataset_name,
                                      analysis_tag="",
                                      dset_split="validation",
                                      dset_mode='color_mask_crop',
                                      n_samples_per_component=15,
                                      truncate_n=20,
                                      show_metadata=False):
    analysis_id = run_id if not analysis_tag else "{}_{}".format(
        run_id, analysis_tag)
    pca_examples_savepath = os.path.join(
        cfg.PREDS_DIR,
        "{}_{}_pca_component_examples.json".format(analysis_id, dset_split))
    print("Loading PCA axis examples from: {}".format(pca_examples_savepath))

    with open(pca_examples_savepath) as infile:
        component_examples = json.load(infile)

    ds = get_dataset(dataset_name, dset_mode=dset_mode, one_sample_only=True)
    ds = ds[0] if dset_split == "train" else ds[1]

    def _load_image(imageid):
        return ds.visualize_item(imageid)

    im_grid = []
    metadata_grid = []
    metadata_rows = []

    for component_data in component_examples[:truncate_n]:
        title = "Component {} (Var explained: {}. Sing. value: {})".format(
            component_data['component_i'],
            round(component_data['explained_variance_ratio'], 2),
            round(component_data['singular_value'], 2))
        metadata_rows.append(title)

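        # evenly subsample the sorted examples for this component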
        examples = component_data['samples_sorted']
        indices_to_sample = np.round(
            np.linspace(0,
                        len(examples) - 1, n_samples_per_component))

        examples = [examples[int(i)] for i in indices_to_sample]

        row_images = []
        row_metadata = []

        for elt in examples:
            coeff, imid = elt
            row_images.append(imid)
            elt_title = "{} ({})".format(
                imid, coeff) if show_metadata else str(round(coeff, 2))
            row_metadata.append(elt_title)

        im_grid.append(row_images)
        metadata_grid.append(row_metadata)

    img_rows_plt(rows=im_grid,
                 metadata=metadata_grid if show_metadata else None,
                 im_load_func=_load_image,
                 row_metadata=metadata_rows if show_metadata else None)
Example #6
def data():
    labels, images = get_dataset()
    X_train, X_test, Y_train, Y_test = train_test_split(images,
                                                        labels,
                                                        test_size=0.15)
    X_train = np.repeat(X_train, 10, axis=0)
    Y_train = np.repeat(Y_train, 10, axis=0)
    return X_train, Y_train, X_test, Y_test
Example #7
def main(cfg):

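    # build datasets, loaders and the network according to the requested mode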
    if cfg['mode'] == 'train':
        train_dataset = get_dataset(mode=cfg['mode'], cfg=cfg)
        val_dataset = get_dataset(mode='val', cfg=cfg)
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=cfg['train']['batch_size'],
            num_workers=8,
            shuffle=True,
            collate_fn=collate_remove_none,
            worker_init_fn=worker_init_fn)
        val_loader = torch.utils.data.DataLoader(
            val_dataset,
            batch_size=cfg['val']['batch_size'],
            num_workers=8,
            shuffle=False,
            collate_fn=collate_remove_none,
            worker_init_fn=worker_init_fn)
        model = get_network(cfg, device='cuda:0', dataset=train_dataset)
    else:
        test_dataset = get_dataset(mode=cfg['mode'], cfg=cfg, return_idx=True)
        test_loader = torch.utils.data.DataLoader(test_dataset,
                                                  batch_size=1,
                                                  num_workers=4,
                                                  shuffle=False)
        model = get_network(cfg, device='cuda:0', dataset=test_dataset)

    if cfg['mode'] == 'train':
        optimizer = optim.Adam(model.parameters(), lr=1e-4)
    else:
        optimizer = None

    if cfg['mode'] == 'train':
        checkpoint = CheckpointIO(cfg['out']['checkpoint_dir'],
                                  model=model,
                                  optimizer=optimizer)
        load_dict = checkpoint.load(cfg['train']['pretrained'])
        train(train_loader, val_loader, model, optimizer, checkpoint, cfg)
    else:
        checkpoint = CheckpointIO(cfg['out']['checkpoint_dir'], model=model)
        load_dict = checkpoint.load(cfg['test']['pretrained'])
        test(test_loader, test_dataset, model, cfg)
Example #8
File: run.py  Project: hewr1993/nn_expr
def get_data(train_or_test):
    isTrain = train_or_test == 'train'
    ds = data_loader.get_dataset(train_or_test)
    if isTrain:
        augmentors = [
            imgaug.Brightness(15),
            imgaug.Contrast((0.8, 1.2)),
            imgaug.MeanVarianceNormalize(all_channel=True)
        ]
    else:
        augmentors = [imgaug.MeanVarianceNormalize(all_channel=True)]
    ds = AugmentImageComponent(ds, augmentors)
    ds = BatchData(ds, BATCH_SIZE, remainder=not isTrain)
    if isTrain:
        ds = PrefetchData(ds, PREFETCH_SIZE, NR_PROC)
    return ds
Example #9
File: run.py  Project: hewr1993/nn_expr
def get_data(train_or_test):
    isTrain = train_or_test == 'train'
    ds = data_loader.get_dataset(train_or_test)
    if isTrain:
        augmentors = [
            imgaug.Brightness(15),
            imgaug.Contrast((0.8, 1.2)),
            imgaug.MeanVarianceNormalize(all_channel=True)
        ]
    else:
        augmentors = [
            imgaug.MeanVarianceNormalize(all_channel=True)
        ]
    ds = AugmentImageComponent(ds, augmentors)
    ds = BatchData(ds, BATCH_SIZE, remainder=not isTrain)
    if isTrain:
        ds = PrefetchData(ds, PREFETCH_SIZE, NR_PROC)
    return ds
Example #10
    def run(self, use_gpu, learners, params_grid, dataset_dir, out_dir):
        dataset = get_dataset(self.name, dataset_dir)

        device_type = 'GPU' if use_gpu else 'CPU'

        for LearnerType in learners:
            learner = LearnerType(dataset, self.task, self.metric, use_gpu)
            algorithm_name = learner.name() + '-' + device_type
            print('Started to train ' + algorithm_name)

            for params in ParameterGrid(params_grid):
                print(params)

                log_dir_name = os.path.join(out_dir, self.name, algorithm_name)
                try:
                    elapsed = learner.run(params, log_dir_name)
                    print('Timing: ' + str(elapsed) + ' sec')
                except Exception as e:
                    print('Exception during training: ' + repr(e))
Example #11
def plot_nearest_neighbors(run_id,
                           dataset_name,
                           analysis_tag="",
                           dset_split="validation",
                           dset_mode='color_mask_crop',
                           subset_n=100,
                           n_to_show=20,
                           plot_metadata=True):
    analysis_id = run_id if not analysis_tag else "{}_{}".format(
        run_id, analysis_tag)
    nn_savepath = os.path.join(
        cfg.PREDS_DIR,
        "{}_{}_{}_nearest_neighbors.json".format(analysis_id, dset_split,
                                                 subset_n))
    print("Loading nearest neighbor data from: {}".format(nn_savepath))

    with open(nn_savepath) as infile:
        nn_data = json.load(infile)

    ks = list(nn_data.keys())[:n_to_show]
    nn_data = {k: nn_data[k] for k in ks}

    ds = get_dataset(dataset_name, dset_mode=dset_mode, one_sample_only=True)
    ds = ds[0] if dset_split == "train" else ds[1]

    def _load_image(imageid):
        return ds.visualize_item(imageid)

    im_grid = []
    metadata_grid = []

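    # one grid row per probe item, with an 'id: value' label for each neighbour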
    for probe_item, neighbor_data in nn_data.items():
        row = [elt[0] for elt in neighbor_data]
        row_metadata = [
            "{}: {}".format(elt[0], elt[1]) for elt in neighbor_data
        ]
        im_grid.append(row)
        metadata_grid.append(row_metadata)

    img_rows_plt(rows=im_grid,
                 metadata=metadata_grid if plot_metadata else None,
                 im_load_func=_load_image)
Example #12
def get_default_test_loader():
    dataset = get_dataset()
    split = int(0.8 * len(dataset.data))  # train-test split
    test_data = dataset.data[split:]
    test_target = dataset.target[split:]

    # Convert dataset into torch tensors
    test = data_utils.TensorDataset(
        torch.tensor(test_data).float(),
        torch.tensor(test_target).long())

    test_loader = data_utils.DataLoader(
        test,  # dataset to load from
        batch_size=BATCH_SIZE,  # examples per batch (default: 1)
        shuffle=False,
        sampler=None,  # if a sampler is specified, `shuffle` must be False
        num_workers=5,  # subprocesses to use for data loading
        pin_memory=False)  # whether to copy batches into pinned (page-locked) memory
    return test_loader
Example #13
        epoch_loss = running_loss / len(test_Y)
        epoch_acc = running_corrects.double() / len(test_Y)
        finaltestletteracc = finaltestletteracc + len(test_Y) * epoch_acc
        if i_batch % 25 == 0:
            print("Letter accuracy =", epoch_acc)

    wtestingepoc.append(wordaccuracies(testpredictedletters,
                                       testactualletters))
    testingepoc.append(finaltestletteracc / len(test))
    print("Testing acc = :", finaltestletteracc / len(test))


# getting the data using the DataLoader class (modified code)
dataset = get_dataset()


# word-accuracy function based on letters; requires dataset.nextletter info
def wordaccuracies(pred, actual):
    incorrectwords = 0
    totalwords = 0
    flag = True

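    # a word ends where dataset.nextletter == -1; count it as incorrect if any of its letters was mispredicted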
    for i in range(len(pred)):

        if pred[i] != actual[i]:
            flag = False
        if dataset.nextletter[split + i] == -1:
            if flag == False:
                incorrectwords += 1
Example #14
print_status(torch.cuda.device_count())
print_status('Using CUDA..')

best_acc = 0  # best test accuracy
start_epoch = 0  # start from epoch 0 or last checkpoint epoch

if args.seed != 0:
    torch.manual_seed(args.seed)

# Data
print_status('==> Preparing data..')
if args.train_type != 'linear_eval':
    raise ValueError('wrong train phase...')
else:
    trainloader, traindst, testloader, testdst = data_loader.get_dataset(args)

if args.dataset == 'cifar-10' or args.dataset == 'mnist':
    num_outputs = 10
elif args.dataset == 'cifar-100':
    num_outputs = 100

if args.model == 'ResNet50':
    expansion = 4
else:
    expansion = 1

# Model
print_status('==> Building model..')
train_type = args.train_type
Example #15
            writer.add_pr_curve('PR_Curve/test', np.asarray(y_actual_test),
                                np.asarray(y_pred_test))
            print(
                f"Test - Loss: {total_loss}, Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1-score: {f1}, ROC_AUC: {roc_auc}, PRC_AUC: {prc_auc}, PCC: {pcc}, Sensitivity: {sensitivity}, PPV: {PPV}, SRCC: {srcc}"
            )

            if epoch % config.ckpt_num == 0:
                torch.save(model.state_dict(), config.model_name)


if __name__ == "__main__":
    torch.manual_seed(3)  # for reproducibility

    device = config.device
    epochs = config.epochs
    dataset_cls, train_loader, val_loader, test_loader, peptide_embedding, mhc_embedding = get_dataset(
        device)
    model = MHCAttnNet(peptide_embedding, mhc_embedding)
    # model.load_state_dict(torch.load(config.model_name))
    model.to(device)
    print(model)
    print('Total parameters', sum(p.numel() for p in model.parameters()))
    print('Trainable parameters',
          sum(p.numel() for p in model.parameters() if p.requires_grad))
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())

    fit(model=model,
        train_dl=train_loader,
        val_dl=val_loader,
        test_dl=test_loader,
        loss_fn=loss_fn,
Example #16
def run(
        model_name='anp',  # choices = ['anp', 'np']
        dataset_name='apollo.npy',
        mask_fname='apollo_train_mask',
        window_size=100,
        sample_size=100,  # set -1 to run vanilla neural processes
        sample_scale_sq=31.25,
        emb_dim=512,
        eps=1e-3,
        fix_eps=150,
        mask_size=5,

        use_rotation_aug=True,
        use_scaling_aug=True,

        learning_rate=1e-4,
        batch_size=512,
        max_epoch=500,
        epoch_split=10,

        # FOR TRAIN
        model_path='',  # model path
        recon_nodata=False,  # reconstruct on no-data gaps

        # FOR RECONSTRUCTION
        # model_path='anp_{EXPERIMENT_ID}.pth',  # model path
        # recon_nodata=True,  # reconstruct on no-data gaps
):
    np.random.seed(7)
    torch.manual_seed(7)

    print(f'Experiment ID: {EXPERIMENT_ID}')
    print(DEVICE)
    if window_size % 2 == 0:
        window_size += 1
    assert window_size > mask_size
    assert sample_size < window_size**2
    center_idx = window_size ** 2 // 2  # index of center pixel

    # loaders
    train_loader, valid_loader, test_loader = get_dataset(
        dataset_name=dataset_name,
        window_size=window_size,
        batch_size=batch_size,
        mask_fname=mask_fname,
        mask_size=mask_size,
        epoch_split=epoch_split,
        recon_nodata=recon_nodata,
    )

    # models
    if model_name == 'np':
        model = NP(
            x_dim=2, y_dim=1, emb_dim=emb_dim,
            dist='Gaussian', stochastic=False,
        )
    elif model_name == 'anp':
        model = ANP(
            x_dim=2, y_dim=1, emb_dim=emb_dim,
            dist='Gaussian', stochastic=False,
        )
    else:
        raise NotImplementedError

    # load model
    if model_path:
        model_path = os.path.join(RESULT_DIR, model_path)
        model.load_state_dict(torch.load(model_path, map_location=DEVICE))

    # initialize model and optimizer
    if recon_nodata:
        max_epoch = 1
    else:
        optimizer = Adam(model.parameters(), lr=learning_rate)

    # initialize variables
    x_grid = tensor(grid(window_size, window_size, scale=2))
    p_grid = tensor(grid(window_size, window_size, scale=(window_size-1)/2))
    p_grid = torch.exp(-1/sample_scale_sq * (p_grid[0, :, 0]**2 + p_grid[0, :, 1]**2))

    best_valid_loss = np.inf
    if recon_nodata:
        loaders = {'test': test_loader}
    else:
        loaders = {'train': train_loader, 'valid': valid_loader, 'test': test_loader}

    for epoch in range(max_epoch):
        for run_type, loader in loaders.items():
            # initialize variables
            if run_type == 'train':
                model.train()
            else:
                model.eval()

            losses = []
            y_true = []
            y_pred = []
            y_sig = []
            idx = []
            tic = time()
            for i, batch_data in enumerate(loader):
                y_context, context_mask, target_value, idx0, idx1 = batch_data
                bs = y_context.size(0)
                x_context = x_grid.expand(bs, -1, -1)

                context_mask[:, center_idx:center_idx + 1] = 0  # mask out the center pixel
                non_context = ~context_mask

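                # subsample context points: random draws weighted by the distance prior during training, top-weighted observed pixels otherwise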
                if 0 < sample_size:
                    if run_type == 'train':
                        sample_idx = torch.multinomial(p_grid, sample_size, replacement=False)
                        x_context = x_context[:, sample_idx]
                        y_context = y_context[:, sample_idx]
                        context_mask = context_mask[:, sample_idx]
                        non_context = non_context[:, sample_idx]
                    else:
                        sample_idx = []
                        for ib in range(bs):
                            prob = p_grid.clone()
                            prob[non_context[ib]] = 0
                            sample_idx.append(torch.topk(prob, sample_size)[1])
                        sample_idx = torch.cat(sample_idx)
                        batch_idx = torch.arange(bs).unsqueeze(-1).expand(-1, sample_size).flatten()
                        x_context = x_context[batch_idx, sample_idx].view(bs, sample_size, -1)
                        y_context = y_context[batch_idx, sample_idx].view(bs, sample_size, -1)
                        context_mask = context_mask[batch_idx, sample_idx].view(bs, sample_size)
                        non_context = non_context[batch_idx, sample_idx].view(bs, sample_size)

                # scale
                ml = context_mask.sum(dim=1, keepdim=True).unsqueeze(-1)
                y_context[non_context] = 0.0
                mean = y_context.sum(dim=1, keepdim=True) / ml
                scale = (y_context - mean)**2
                scale[non_context] = 0.0
                scale = torch.sqrt(scale.sum(dim=1, keepdim=True) / ml)
                y_context = (y_context - mean) / scale

                # augment
                if run_type == 'train':
                    if use_rotation_aug:
                        theta = torch.rand(
                            bs, 1, 1, dtype=torch.float32, device=DEVICE
                        ) * (math.pi * 2)
                        cth = torch.cos(theta)
                        sth = torch.sin(theta)
                        x_context = torch.cat(
                            (x_context[:, :, 0:1] * cth - x_context[:, :, 1:2] * sth,
                             x_context[:, :, 0:1] * sth + x_context[:, :, 1:2] * cth),
                            dim=-1
                        )
                    if use_scaling_aug:
                        y_scale = torch.rand(
                            bs, 1, 1, dtype=torch.float32, device=DEVICE
                        ) + 0.5
                        y_context *= y_scale
                        scale *= y_scale

                # target value
                x_center = zeros(bs, 1, 2)
                y_target = model(
                    x_context, y_context, context_mask, non_context, x_center
                )
                mu, logvar = torch.chunk(y_target, 2, dim=-1)
                if epoch <= fix_eps:
                    sigma = eps * ones_like(logvar)
                else:
                    sigma = eps + torch.exp(0.5 * logvar)

                # rescale
                mu = mu * scale + mean
                sigma *= scale

                # compute loss and update
                loss = torch.mean(
                    0.5 * ((target_value - mu) / sigma) ** 2 + torch.log(sigma),
                )
                if run_type == 'train':
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                losses.append(loss.item())
                y_true.append(to_np(target_value))
                y_pred.append(to_np(mu))
                y_sig.append(to_np(sigma))
                idx.append(np.concatenate((to_np(idx0), to_np(idx1)), 1))

            # report results
            y_true = np.concatenate(y_true)
            y_pred = np.concatenate(y_pred)
            y_sig = np.concatenate(y_sig)
            idx = np.concatenate(idx)

            # save sample results
            if recon_nodata:
                fname = os.path.join(
                    RESULT_DIR, f'recon_{EXPERIMENT_ID}.npz'
                )
                np.savez(
                    fname, y_pred=y_pred.flatten(), y_sig=y_sig.flatten(), idx=idx
                )
            else:
                l1_err = np.mean(np.abs(y_true - y_pred))
                rmse = np.sqrt(np.mean((y_true - y_pred)**2))
                loss = np.mean(losses)

                if run_type == 'valid' and loss < best_valid_loss:
                    print('Best !!')
                    best_valid_loss = loss

                    # save model
                    fname = os.path.join(
                        RESULT_DIR, f'{model_name}_{EXPERIMENT_ID}.pth'
                    )
                    torch.save(model.state_dict(), fname)

                report_dict = {
                    'epoch': epoch,
                    f'{run_type}__loss': float(loss.item()),
                    f'{run_type}__l1err': float(l1_err),
                    f'{run_type}__rmse': float(rmse),
                    f'{run_type}__epochtime': float(time() - tic),
                }
                pprint(report_dict)
Example #17
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if torch.cuda.is_available():
    print("cuda")
else:
    print("CPU")

if __name__ == '__main__':
    if not os.path.exists(args.save):
        os.makedirs(args.save)

    # Data Loader
    transformer = ToDouble if args.double else Identity

    train_dataset, test_dataset, num_classes = get_dataset(
        args.dataset, tensor_type_transformer=transformer)

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.test_batch_size,
                                              shuffle=False)

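    # wrap the train loader in an endless generator; one epoch corresponds to len(train_loader) batches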
    data_gen = inf_generator(train_loader)
    batches_per_epoch = len(train_loader)

    print("Train Data: {}, Test Data: {}".format(len(train_dataset),
                                                 len(test_dataset)))
    print("Total number of classes: {}".format(num_classes))
Example #18
    torch.manual_seed(args.seed)

world_size = args.ngpu
torch.distributed.init_process_group(
    'nccl',
    init_method='env://',
    world_size=world_size,
    rank=args.local_rank,
)

# Data
print_status('==> Preparing data..')
if args.train_type != 'contrastive':
    raise ValueError('wrong train phase...')
else:
    trainloader, traindst, testloader, testdst, train_sampler = data_loader.get_dataset(
        args)

# Model
print_status('==> Building model..')
torch.cuda.set_device(args.local_rank)
model = model_loader.get_model(args)

if args.model == 'ResNet18':
    expansion = 1
elif args.model == 'ResNet50':
    expansion = 4
else:
    raise ValueError('wrong model type')
projector = Projector(expansion=expansion)

if 'Rep' in args.advtrain_type:
Example #19
def train(args, device):

    num_client = args.num_client
    trainset, testset = dl.get_dataset(args)
    sample_inds = dl.get_indices(trainset, args)
    # PS model
    net_ps = get_net(args).to(device)

    net_users = [get_net(args).to(device) for u in range(num_client)]

    optimizers = [
        torch.optim.SGD(net_users[cl].parameters(),
                        lr=args.lr,
                        weight_decay=1e-4) for cl in range(num_client)
    ]
    criterions = [nn.CrossEntropyLoss() for u in range(num_client)]
    testloader = torch.utils.data.DataLoader(testset,
                                             batch_size=args.bs,
                                             shuffle=False,
                                             num_workers=2)
    schedulers = [
        torch.optim.lr_scheduler.StepLR(optimizers[cl],
                                        step_size=30,
                                        gamma=0.1) for cl in range(num_client)
    ]

    # sync all client models with the parameter server (PS)
    [sf.pull_model(net_users[cl], net_ps) for cl in range(num_client)]

    net_sizes, net_nelements = sf.get_model_sizes(net_ps)
    ind_pairs = sf.get_indices(net_sizes, net_nelements)
    N_s = (50000 if args.dataset_name == 'cifar10' else 60000)
    modelsize = sf.count_parameters(net_ps)
    errorCorCof = 1
    layer_types = []
    for p in net_ps.named_parameters():
        names = p[0]
        layer_types.append(names.split('.'))
    errors = []
    accuracys = []
    ps_model_mask = torch.ones(modelsize).to(device)
    sf.initialize_zero(net_ps)
    currentLR = args.lr
    for cl in range(num_client):
        errors.append(torch.zeros(modelsize).to(device))
    runs = math.ceil(N_s / (args.bs * num_client))

    acc = evaluate_accuracy(net_ps, testloader, device)
    accuracys.append(acc * 100)
    for epoch in tqdm(range(args.num_epoch)):
        if epoch == args.errDecayVals[0] and args.errorDecay is True:
            errorCorCof = args.errDecayVals[1]
        if args.warmUp and epoch < 5:
            for cl in range(num_client):
                for param_group in optimizers[cl].param_groups:
                    if epoch == 0:
                        param_group['lr'] = 0.1
                    else:
                        lr_change = (args.lr - 0.1) / 4
                        param_group['lr'] = (lr_change * epoch) + 0.1
        if epoch in args.lr_change:
            for cl in range(num_client):
                sf.adjust_learning_rate(optimizers[cl], epoch, args.lr_change,
                                        args.lr)
        currentLR = sf.get_LR(optimizers[0])

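        # each communication round: every client takes one local step on its own split, then the (sparsified) client updates are averaged into the PS model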
        for run in range(runs):

            for cl in range(num_client):

                trainloader = DataLoader(dl.DatasetSplit(
                    trainset, sample_inds[cl]),
                                         batch_size=args.bs,
                                         shuffle=True)
                for data in trainloader:
                    inputs, labels = data
                    inputs, labels = inputs.to(device), labels.to(device)
                    optimizers[cl].zero_grad()
                    predicts = net_users[cl](inputs)
                    loss = criterions[cl](predicts, labels)
                    loss.backward()
                    optimizers[cl].step()
                    break
            ps_model_flat = sf.get_model_flattened(net_ps, device)
            ps_model_dif = torch.zeros_like(ps_model_flat)
            for cl in range(num_client):
                model_flat = sf.get_model_flattened(net_users[cl], device)
                model_flat.add_(errors[cl] * currentLR * errorCorCof)
                difmodel = (model_flat.sub(ps_model_flat)).to(device)
                difmodel_clone = torch.clone(difmodel).to(device)

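                # sparsify (and optionally quantize) the local update, keeping the dropped residual as error feedback for later rounds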
                if not (args.warmUp and epoch < 5):
                    if args.layer_wise_spars and args.worker_LWS:
                        sf.sparse_timeC_alt(difmodel, args.sparsity_window, 10,
                                            args.lws_sparsity_w, ps_model_mask,
                                            ind_pairs, device)
                    else:
                        sf.sparse_timeC(difmodel, args.sparsity_window, 10,
                                        ps_model_mask, device)

                    if args.quantization:
                        sf.groups(difmodel, args.num_groups, args.denominator,
                                  device)
                    errors[cl] = (difmodel_clone.sub(difmodel)) / currentLR

                ps_model_dif.add_(difmodel / num_client)
            ps_model_flat.add_(ps_model_dif)
            topk = math.ceil(ps_model_dif.nelement() / args.sparsity_window)
            ind = torch.topk(ps_model_dif.abs(), k=topk, dim=0)[1]
            if not (args.warmUp and epoch < 5):
                if args.layer_wise_spars:
                    ps_model_mask = sf.sparse_special_mask(
                        ps_model_flat, args.sparsity_window, args.lws_sparsity,
                        ind_pairs, device)
                else:
                    ps_model_mask *= 0
                    ps_model_mask[ind] = 1

            sf.make_model_unflattened(net_ps, ps_model_flat, net_sizes,
                                      ind_pairs)

            [sf.pull_model(net_users[cl], net_ps) for cl in range(num_client)]
            '''
            if run %10 == 0: ##debug
                acc = evaluate_accuracy(net_ps, testloader, device)
                print('accuracy:', acc * 100)
                break
            '''

        acc = evaluate_accuracy(net_ps, testloader, device)
        accuracys.append(acc * 100)
        print(
            'accuracy:',
            acc * 100,
        )
    return accuracys
Example #20
                    else:
                        batch_loss = train_step(adj, nodes, targ)
                        print('Epoch {} Batch {} Batch Loss {:.4f} '.format(
                            epoch, batch, batch_loss.numpy()))

                    if batch % args.checkpoint == 0:
                        ckpt_save_path = ckpt_manager.save()
                        print("Saving checkpoint \n")
                    print('Time {} \n'.format(time.time() - start))
                    pbar.update(1)

    elif args.enc_type == 'rnn' and args.dec_type == "rnn":

        OUTPUT_DIR += '/' + args.enc_type + '_' + args.dec_type
        dataset, BUFFER_SIZE, BATCH_SIZE,\
        steps_per_epoch, vocab_inp_size, vocab_tgt_size, target_lang = get_dataset(args)

        step = 0

        if args.decay is not None:
            learning_rate = CustomSchedule(args.emb_dim,
                                           warmup_steps=args.decay_steps)
            optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                               beta1=0.9,
                                               beta2=0.98,
                                               epsilon=1e-9)
        else:
            optimizer = tf.train.AdamOptimizer(beta1=0.9,
                                               beta2=0.98,
                                               epsilon=1e-9)
Example #21
def train():
    print('loading data')
    x, y, x_test, y_test = get_dataset()    # use pca

    n_train = x.shape[0]
    n_test = x_test.shape[0]
    n_batches = n_train // batch_size
    print('train samples: %d' % n_train)
    print('test samples: %d' % n_test)

    x, y, y_test = get_batches(x, y, y_test)

    config = tf.ConfigProto(device_count={'GPU': 1}, allow_soft_placement=True)
    config.gpu_options.allow_growth = True

    with tf.Session(config=config) as sess:
        print('creating model')
        model = create_model(sess)

        start_time = time.time()
        total_loss = []
        train_acc = []
        test_acc = []
        # GO !!
        for e in range(epochs):
            print('working on epoch {0}/{1}'.format(e + 1, epochs))
            epoch_start_time = time.time()
            epoch_loss, epoch_acc = 0, 0

            for i in range(n_batches):
                print('working on epoch {0}, batch {1}/{2}'.format(e+1, i+1, n_batches))
                enc_in, dec_out = x[i], y[i]
                _, output, step_loss, _ = model.step(sess, enc_in, dec_out)
                step_acc = evaluate_batch(output, dec_out) / batch_size
                epoch_loss += step_loss
                epoch_acc += step_acc
                print('current batch loss: {:.2f}'.format(step_loss))

            epoch_time = time.time() - epoch_start_time
            print('epoch {0}/{1} finish in {2:.2f} s'.format(e+1, epochs, epoch_time))
            epoch_loss /= n_batches
            epoch_acc /= n_batches
            total_loss.append(epoch_loss)
            train_acc.append(epoch_acc)
            print('average epoch loss: {:.4f}'.format(epoch_loss))
            print('average epoch acc: {:.4f}'.format(epoch_acc))

            print('saving model...')
            model.saver.save(sess, ckpt_path, model.global_step.eval())

            # test after each epoch
            output = model.step(sess, x_test, y_test, is_train=False)[0]
            step_acc = evaluate_batch(output, y_test, n_test) / n_test
            test_acc.append(step_acc)
            print('test acc: %.4f\n' % step_acc)

        print('training finish in {:.2f} s'.format(time.time() - start_time))

        with open(os.path.join(store_path, 'summary.txt'), 'w') as f:
            for i in range(epochs):
                f.write('{0}\t{1}\t{2}\n'.format(total_loss[i], train_acc[i], test_acc[i]))
Example #22
File: main.py  Project: shahabty/3D-RCNN
from training import get_trainer
from testing import get_tester

cfg = load_config('config.yaml')
is_train = cfg['mode']['train']
is_val = cfg['mode']['val']
is_test = cfg['mode']['test']
mode = 'train' if is_train or is_val else 'test'
device = torch.device("cuda:0" if (
    torch.cuda.is_available() and not cfg[mode]['no_cuda']) else "cpu")
torch.cuda.set_device(device)

if __name__ == '__main__':
    if mode == 'train':
        train_dataset = get_dataset(name=cfg['data']['dataset'],
                                    mode='train',
                                    data_path=cfg['data']['data_path'],
                                    device=device)
        val_dataset = get_dataset(name=cfg['data']['dataset'],
                                  mode='val',
                                  data_path=cfg['data']['data_path'],
                                  device=device)
        train_loader = DataLoader(train_dataset,
                                  batch_size=cfg['train']['batch_size'],
                                  num_workers=4,
                                  shuffle=True)
        val_loader = DataLoader(val_dataset,
                                batch_size=cfg['val']['batch_size'],
                                num_workers=4,
                                shuffle=True)
        # The renderer is implemented separately because it is replaceable and is not used at inference.
        renderer = get_renderer(name=cfg['train']['model']['renderer'],
Example #23
from sklearn import svm
from data_loader import get_dataset

train_x, train_y, test_x, test_y = get_dataset()  # use pca

for penalty in [1, 10, 100]:
    print('C = %d' % penalty)
    for kernel in ['linear', 'rbf']:
        print(kernel)
        clf = svm.SVC(kernel=kernel, C=penalty)
        clf.fit(train_x, train_y)
        print('score: %.4f' % clf.score(test_x, test_y))
        print('support vector: %d' % clf.support_vectors_.shape[0])
Example #24
def generate_embedding_vectors(run_id,
                               analysis_tag="",
                               num_workers=20,
                               use_gpu=True,
                               dset_mode=None,
                               dset_split="validation",
                               dataset_name="deepfashion",
                               principal_encoder='1',
                               *useless_args,
                               **useless_kwargs):

    ckpt_path = os.path.join(cfg.CKPT_DIR, "{}.pth".format(run_id))
    print("ckpt path: {}".format(ckpt_path))

    analysis_id = run_id if not analysis_tag else "{}_{}".format(
        run_id, analysis_tag)
    preds_path = os.path.join(
        cfg.PREDS_DIR, "{}_{}_embedding.json".format(analysis_id, dset_split))
    print("preds savepath: {}".format(preds_path))

    if use_gpu:
        if not torch.cuda.is_available():
            raise RuntimeError("cuda not available")
        device = torch.device('cuda')
    else:
        device = torch.device("cpu")

    print('DEVICE', device)

    # load the ckpt
    print("Loading model from path: {}".format(ckpt_path))
    ckpt = torch.load(ckpt_path)
    dset_mode = ckpt['dset_mode'] if dset_mode is None else dset_mode
    model_type = ckpt.get('model_type', 'siamese')

    # model
    model = get_model(model_type,
                      freeze_encoder=True,
                      train_mode=False,
                      principal_encoder=principal_encoder)
    enc_dim = model.enc_dim
    model = nn.DataParallel(model)
    model.load_state_dict(ckpt['model_state_dict'])

    model.to(device)

    print("USING MODEL TYPE {} ON DSET {}".format(model_type, dataset_name))
    print("Using dset mode: {}".format(dset_mode))

    # data loader
    ds = get_dataset(dataset_name, dset_mode, one_sample_only=True)
    ds = ds[0] if dset_split == "train" else ds[1]
    itemids = ds.get_itemids()
    # ds = Subset(ds, range(200))
    dl = DataLoader(ds,
                    batch_size=cfg.BATCH_SIZE,
                    shuffle=False,
                    num_workers=num_workers)

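    # pre-allocate one row of encoder outputs per dataset item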
    encodings_arr = np.zeros((len(ds), enc_dim))

    with torch.no_grad():
        for i, x in tqdm(enumerate(dl), total=len(ds) / cfg.BATCH_SIZE):
            x = x.to(device)
            enc = model(x)
            encodings_arr[i * cfg.BATCH_SIZE:(i + 1) *
                          cfg.BATCH_SIZE, :] = enc.cpu().numpy()

    print(encodings_arr.shape)

    encodings = {}

    for i in range(len(encodings_arr)):
        k = itemids[i]
        encoding_vec = encodings_arr[i, :]
        encodings[k] = encoding_vec.tolist()

    # TODO: load ckpt
    with open(preds_path, "w") as outfile:
        print("Saving preds to: {}".format(preds_path))
        json.dump(encodings, outfile)
Example #25
import sys
from data_loader import get_dataset

DATA_NAMES = [
    "abalone", "airline", "airline-one-hot", "epsilon", "higgs", "letters",
    "msrank", "msrank-classification"
]

if __name__ == "__main__":
    out_dir = sys.argv[1]
    print('out_dir: ' + str(out_dir))

    for dataset_name in DATA_NAMES:
        print('Processing ' + dataset_name)
        get_dataset(dataset_name, out_dir)
Example #26
def main():
    print("Loading data...\n")

    dataset = get_dataset()
    (train_X, train_Y), (test_X, test_Y) = process_data(dataset)

    # Convert the dataset into torch tensors
    train = data_utils.TensorDataset(
        torch.tensor(train_X).float(),
        torch.tensor(train_Y).long())
    test = data_utils.TensorDataset(
        torch.tensor(test_X).float(),
        torch.tensor(test_Y).long())

    train_loader = data_utils.DataLoader(train,
                                         batch_size=BATCH_SIZE,
                                         shuffle=True,
                                         num_workers=5,
                                         sampler=None,
                                         pin_memory=False)

    test_loader = data_utils.DataLoader(
        test,  # dataset to load from
        batch_size=BATCH_SIZE,  # examples per batch (default: 1)
        shuffle=False,
        sampler=None,  # if a sampler is specified, `shuffle` must be False
        num_workers=5,  # subprocesses to use for data loading
        pin_memory=False)  # whether to copy batches into pinned (page-locked) memory

    # Calculate the word-level accuracy on the training and the test set
    default_train_loader = get_default_train_loader()
    default_test_loader = get_default_test_loader()

    if args.model == "lenet":
        print("Running LeNet on OCR")
        model = LeNet()
    else:
        print("Running AlexNet on OCR")
        model = AlexNet(num_classes=26)

    model.to(device)

    criterion = nn.CrossEntropyLoss()

    optimizer = optim.LBFGS(model.parameters(), history_size=5, max_iter=5)

    if args.num_epochs is not None:
        NUM_EPOCHS = args.num_epochs
    else:
        NUM_EPOCHS = 100

    print("Starting Training...\n")

    letter_training_accuracies = []
    letter_test_accuracies = []
    word_training_accuracies = []
    word_test_accuracies = []

    for epoch in range(NUM_EPOCHS):
        print("Processing epoch {}".format(epoch + 1))
        running_loss = 0.0

        for i_batch, sample in enumerate(train_loader, 0):
            train_X = sample[0]
            train_Y = sample[1]
            train_X, train_Y = train_X.to(device), train_Y.to(device)
            train_Y_labels = torch.max(train_Y, 1)[1]

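            # LBFGS re-evaluates the objective several times per step, so it needs a closure that recomputes the loss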
            def closure():
                optimizer.zero_grad()
                outputs = model(train_X)
                outputs.to(device)
                tr_loss = criterion(outputs, train_Y_labels)
                print('Loss at epoch {}, batch {}: {}'.format(
                    epoch + 1, i_batch, tr_loss.item()))
                tr_loss.backward()
                del outputs
                return tr_loss

            optimizer.step(closure)

            del train_X, train_Y, train_Y_labels

        # Calculate the letter-level accuracy on the training and the test set
        letter_training_accuracy = letter_accuracy(train_loader, model)
        letter_test_accuracy = letter_accuracy(test_loader, model)
        letter_training_accuracies.append(letter_training_accuracy)
        letter_test_accuracies.append(letter_test_accuracy)

        word_training_accuracy = word_accuracy(default_train_loader, model)
        word_test_accuracy = word_accuracy(default_test_loader, model)
        word_training_accuracies.append(word_training_accuracy)
        word_test_accuracies.append(word_test_accuracy)

        print('\nLetter Training Accuracy on epoch {}: {}'.format(
            epoch + 1, letter_training_accuracy))
        print('Letter Test Accuracy on epoch {}: {}'.format(
            epoch + 1, letter_test_accuracy))
        print('Word Training Accuracy on epoch {}: {}'.format(
            epoch + 1, word_training_accuracy))
        print('Word Training Accuracy on epoch {}: {}\n'.format(
            epoch + 1, word_test_accuracy))

    final_letter_test_accuracy = letter_accuracy(test_loader, model)
    final_word_test_accuracy = word_accuracy(default_test_loader, model)

    print("Letter Test accuracy of {} on OCR Data: {}".format(
        args.model, final_letter_test_accuracy))
    print("Word Test accuracy of {} on OCR Data: {}".format(
        args.model, final_word_test_accuracy))

    save_accuracies(letter_training_accuracies, letter_test_accuracies,
                    "letter", args.model, "lbfgs")
    save_accuracies(word_training_accuracies, word_test_accuracies, "word",
                    args.model, "lbfgs")

    # Save the model
    print("Saving {} model to {}".format(args.model, PATH))
    torch.save(model, PATH)
Example #27
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--use-gpu', action='store_true')
    parser.add_argument('--datasets', default='datasets')
    parser.add_argument('--iterations', default=1000, type=int)
    parser.add_argument('--result', default='result.json')
    parser.add_argument('--table', default='common-table.txt')
    args = parser.parse_args()
    sTime = time.strftime("%m%d_%H_%M_%S", time.localtime())
    #sTime = '{:0>2d}_{:0>2d}_{:0>2d}_{:0>2d}'.format(*time.gmtime()[1:5])
    args.result = f'./result/result_{sTime}.json'

    experiments_names = [
        #'abalone',
        "MICROSOFT",
        "YEAR",
        "YAHOO",
        #"HIGGS",
        #
        #"CLICK",
        #'EPSILON',
        #'airline',
        #'epsilon',
        #'higgs',
        #'letters',
        #'msrank',
        #'msrank-classification',
        #'synthetic',
        #'synthetic-5k-features'
    ]

    # learners = [
    #     XGBoostLearner,
    #     LightGBMLearner,
    #     CatBoostLearner,
    # ]

    iterations = args.iterations
    logs_dir = 'logs'

    params_grid = {
        'iterations': [iterations],
        'max_depth': [6],
        'learning_rate': [0.03, 0.07, 0.15]
    }

    #args.datasets = "L:/Datasets/"
    nEXP = len(experiments_names)
    for i, experiment_name in enumerate(experiments_names):
        print(f"\n********************* {experiment_name} {i+1}/{nEXP} ......",
              end="")
        data_tuple, desc = get_dataset(experiment_name, args.datasets)
        print(
            f"\r********************* {experiment_name} {i+1}/{nEXP} *********************\n{desc}"
        )

        experiment = EXPERIMENTS[experiment_name]
        #experiment.run(args.use_gpu, learners, params_grid, args.datasets, args.result, logs_dir)
        experiment.run(args.use_gpu, learners, params_grid, data_tuple,
                       args.result, logs_dir)

    stats = get_experiment_stats(args.result, args.use_gpu, niter=iterations)
    print_all_in_one_table(stats,
                           args.use_gpu,
                           learners,
                           params=(6.0, 1.0),
                           output=args.table)
Example #28
def main(verbose=1,
         print_freq=100,
         restore=True,
         ckpt_path=None,
         val_freq=1,
         run_id="model",
         dset_mode="grayscale_mask",
         model_type="siamese",
         dataset_name="deepfashion",
         ckpt_type="siamese",
         freeze_encoder_until_it=1000):

    print("TRAINING MODEL {} ON DATASET {}".format(model_type, dataset_name))

    if restore and ckpt_path:
        raise RuntimeError("Specify either restore or ckpt_path, not both")

    ckpt_savepath = os.path.join(cfg.CKPT_DIR, "{}.pth".format(run_id))
    print("Saving ckpts to {}".format(ckpt_savepath))
    logs_savepath = os.path.join(cfg.LOGDIR, run_id)
    print("Saving logs to {}".format(logs_savepath))

    if restore or ckpt_path:
        print("Restoring weights from {}".format(
            ckpt_savepath if restore else ckpt_path))

    if cfg.USE_GPU:
        if not torch.cuda.is_available():
            raise RuntimeError("cuda not available")
        device = torch.device('cuda')
    else:
        device = torch.device("cpu")

    print('DEVICE', device)

    # model
    model = get_model(model_type)
    model = DataParallel(model)

    # must call this before constructing the optimizer:
    # https://pytorch.org/docs/stable/optim.html
    model.to(device)

    # set up training
    # TODO better one?
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=0.01,
                                momentum=0.9,
                                weight_decay=0.0001)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=0.1)
    criterion = ContrastiveLoss()

    initial_epoch = 0
    iteration = 0
    unfrozen = False

    if ckpt_path:
        ckpt = torch.load(ckpt_path)
        state_dict = ckpt['model_state_dict']

        if ckpt_type == model_type:
            model.load_state_dict(state_dict)
        elif model_type == 'dual' and ckpt_type == 'siamese':
            model = load_siamese_ckpt_into_dual(model, state_dict)
        else:
            raise NotImplementedError()

    elif restore:
        if os.path.exists(ckpt_savepath):
            print("LOADING MODEL")
            ckpt = torch.load(ckpt_savepath)
            model.load_state_dict(ckpt['model_state_dict'])
            optimizer.load_state_dict(ckpt['optimizer_state_dict'])
            initial_epoch = ckpt['epoch']
            iteration = ckpt['it']
            dset_mode = ckpt.get('dset_mode', dset_mode)

    else:
        raise RuntimeError("Should not get here! Check for bugs")

    print("Using dset_mode {}".format(dset_mode))

    # dataset
    train_ds, test_ds = get_dataset(dataset_name, dset_mode)
    # train_ds = Subset(train_ds, range(500))
    # test_ds = Subset(test_ds, range(100))
    train_dl = DataLoader(train_ds,
                          batch_size=cfg.BATCH_SIZE,
                          shuffle=True,
                          num_workers=cfg.NUM_WORKERS)
    test_dl = DataLoader(test_ds,
                         batch_size=cfg.BATCH_SIZE,
                         shuffle=False,
                         num_workers=cfg.NUM_WORKERS)

    # training loop
    start = time.time()

    try:
        for epoch in range(initial_epoch, cfg.NUM_EPOCHS):
            logger = SummaryWriter(logs_savepath)

            # effectively puts the model in train mode.
            # Opposite of model.eval()
            model.train()

            print("Epoch {}".format(epoch))

            for i, (im1, im2, y) in tqdm(enumerate(train_dl),
                                         total=len(train_ds) / cfg.BATCH_SIZE):
                iteration += 1

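                # unfreeze the encoder once the warm-up iteration budget has passed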
                if not unfrozen and iteration > freeze_encoder_until_it:
                    print("Unfreezing encoder")
                    unfrozen = True

                    for param in model.parameters():
                        param.requires_grad = True

                logger.add_scalar('DataTime', time.time() - start, iteration)

                im1 = im1.to(device)
                im2 = im2.to(device)
                y = y.to(device)

                enc1, enc2 = model(im1, im2)
                loss = criterion(enc1, enc2, y)

                # zero out gradients accumulated from the previous iteration
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # logging
                logger.add_scalar('TrainLoss', loss.item(), iteration)
                logger.add_scalar('ItTime', time.time() - start, iteration)
                start = time.time()

                # display metrics

            # do some validation

            if (epoch + 1) % val_freq == 0:
                print("Validating...")
                model.eval()  # puts model in validation mode

                with torch.no_grad():

                    for i, (im1, im2,
                            y) in tqdm(enumerate(test_dl),
                                       total=len(test_ds) / cfg.BATCH_SIZE):
                        im1 = im1.to(device)
                        im2 = im2.to(device)
                        y = y.to(device)

                        enc1, enc2 = model(im1, im2)
                        loss = criterion(enc1, enc2, y)

                        logger.add_scalar('ValLoss', loss, iteration)

            # end of epoch
            lr_scheduler.step()

            save_ckpt(ckpt_savepath, model, epoch, iteration, optimizer,
                      dset_mode, dataset_name, model_type)

    except KeyboardInterrupt:
        print('Got keyboard interrupt, saving model...')
        save_ckpt(ckpt_savepath, model, epoch, iteration, optimizer, dset_mode,
                  dataset_name, model_type)
Example #29
    f1 = 2. * prec * recall / (prec + recall) if (prec + recall) > 0 else 0
    report = 'Evaluation: F1 %.4f (%.4f %i/%i, %.4f %i/%i, %i)' % \
        (f1, prec, cnt_match, cnt_pred, recall, cnt_match, cnt_label, cnt_length)
    with open(os.path.join(args.save_dir, 'train_log.txt'), 'a') as f:
        f.write(report + '\n')
    print(report)
    if f1 > args.max_f1:
        args.max_f1 = f1
        torch.save(model.state_dict(),
                   os.path.join(args.save_dir, 'f1_%.4f_params.pkl' % f1))


if __name__ == '__main__':

    # prepare dataset
    train_set, test_set = get_dataset(args)

    # define model
    Model = getattr(models, args.model)
    if args.device > -1:
        os.environ['CUDA_VISIBLE_DEVICES'] = '{}'.format(args.device)
        model = Model(args).cuda()

    # record
    localtime = time.asctime(time.localtime(time.time()))
    with open(os.path.join(args.save_dir, 'train_log.txt'), 'a') as f:
        f.write('*********** %s ***********\n' % localtime)
    with open(os.path.join(args.save_dir, 'config.json'), 'wt') as f:
        json.dump(vars(args), f, indent=2)

    # train
示例#30
0
def main(verbose: int = 1,
         print_freq: int = 100,
         restore: Union[bool, str] = True,
         val_freq: int = 1,
         run_id: str = "model",
         dset_name: str = "memento_frames",
         model_name: str = "frames",
         freeze_until_it: int = 1000,
         additional_metrics: Mapping[str, Callable] = {'rc': rc},
         debug_n: Optional[int] = None,
         batch_size: int = cfg.BATCH_SIZE,
         require_strict_model_load: bool = False,
         restore_optimizer=True,
         optim_string='adam',
         lr=0.01) -> None:

    print("TRAINING MODEL {} ON DATASET {}".format(model_name, dset_name))

    ckpt_savedir = os.path.join(cfg.DATA_SAVEDIR, run_id, cfg.CKPT_DIR)
    print("Saving ckpts to {}".format(ckpt_savedir))
    logs_savepath = os.path.join(cfg.DATA_SAVEDIR, run_id, cfg.LOGDIR)
    print("Saving logs to {}".format(logs_savepath))
    utils.makedirs([ckpt_savedir, logs_savepath])
    last_ckpt_path = os.path.join(ckpt_savedir, "last_model.pth")

    device = utils.set_device()

    print('DEVICE', device)

    # model
    model = get_model(model_name, device)
    # print("model", model)
    model = DataParallel(model)

    # must call this before constructing the optimizer:
    # https://pytorch.org/docs/stable/optim.html
    model.to(device)

    # set up training
    # TODO better one?

    if optim_string == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    elif optim_string == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=lr,
                                    momentum=0.9,
                                    weight_decay=0.0001)
    else:
        raise RuntimeError(
            "Unrecognized optimizer string {}".format(optim_string))

    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=5,
                                                   gamma=0.1)
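    # StepLR multiplies the learning rate by gamma every step_size epochs,
    # so with these settings the lr is cut to 10% every 5 epochs (driven by
    # the lr_scheduler.step() call at the end of each epoch below).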
    # criterion = MemAlphaLoss(device=device)
    # criterion = MemMSELoss()
    # criterion = lambda x, y: MemMSELoss()(x, y) +
    # CaptionsLoss(device=device)(x, y)
    losses = {
        'mem_mse':
        MemMSELoss(device=device, weights=np.load("memento_weights.npy")),
        'captions':
        CaptionsLoss(device=device,
                     class_weights=cap_utils.get_vocab_weights())
    }

    initial_epoch = 0
    iteration = 0
    unfrozen = False
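    # The encoder weights are frozen somewhere outside this snippet (not
    # shown). Assuming the model exposes its encoder as an attribute
    # (hypothetical name), the usual pattern would be:
    #
    #     for param in model.module.encoder.parameters():
    #         param.requires_grad = False
    #
    # The loop below then re-enables requires_grad on all parameters once
    # `iteration` exceeds freeze_until_it.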

    if restore:
        ckpt_path = restore if isinstance(restore, str) else last_ckpt_path

        if os.path.exists(ckpt_path):

            print("Restoring weights from {}".format(ckpt_path))

            ckpt = torch.load(ckpt_path)
            utils.try_load_state_dict(model, ckpt['model_state_dict'],
                                      require_strict_model_load)

            if restore_optimizer:
                utils.try_load_optim_state(optimizer,
                                           ckpt['optimizer_state_dict'],
                                           require_strict_model_load)
            initial_epoch = ckpt['epoch']
            iteration = ckpt['it']
    else:
        ckpt_path = last_ckpt_path
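    # save_ckpt is defined elsewhere in this example (not shown). Given the
    # keys read back above, a minimal sketch of it might look like:
    #
    #     def save_ckpt(path, model, epoch, it, optimizer, *extra):
    #         torch.save({'model_state_dict': model.state_dict(),
    #                     'optimizer_state_dict': optimizer.state_dict(),
    #                     'epoch': epoch,
    #                     'it': it}, path)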

    # dataset
    train_ds, val_ds, test_ds = get_dataset(dset_name)
    assert val_ds or test_ds

    if debug_n is not None:
        train_ds = Subset(train_ds, range(debug_n))
        test_ds = Subset(test_ds, range(debug_n))

    train_dl = DataLoader(train_ds,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=cfg.NUM_WORKERS)
    test_dl = DataLoader(test_ds,
                         batch_size=batch_size,
                         shuffle=False,
                         num_workers=cfg.NUM_WORKERS)

    # training loop
    start = time.time()

    try:
        for epoch in range(initial_epoch, cfg.NUM_EPOCHS):
            logger = SummaryWriter(logs_savepath)

            # effectively puts the model in train mode.
            # Opposite of model.eval()
            model.train()

            print("Epoch {}".format(epoch))

            for i, (x, y_) in tqdm(enumerate(train_dl),
                                   total=len(train_dl)):

                y: ModelOutput[MemModelFields] = ModelOutput(y_)
                iteration += 1

                if not unfrozen and iteration > freeze_until_it:
                    print("Unfreezing encoder")
                    unfrozen = True

                    for param in model.parameters():
                        param.requires_grad = True

                logger.add_scalar('DataTime', time.time() - start, iteration)

                x = x.to(device)
                y = y.to_device(device)

                out = ModelOutput(model(x, y.get_data()))
                loss_vals = {name: l(out, y) for name, l in losses.items()}
                # print("loss_vals", loss_vals)
                loss = torch.stack(list(loss_vals.values()))

                if verbose:
                    print("stacked loss", loss)
                loss = loss.sum()
                # loss = criterion(out, y)

                # Zero out gradients accumulated from the previous step;
                # PyTorch accumulates gradients by default, so this must be
                # called before backward() unless you intend to accumulate.
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # logging
                utils.log_loss(logger, loss, loss_vals, iteration)
                logger.add_scalar('ItTime', time.time() - start, iteration)
                start = time.time()

                # display metrics

            # do some validation

            if (epoch + 1) % val_freq == 0:
                print("Validating...")
                model.eval()  # eval mode: disables dropout, uses running BN stats
                val_iteration = iteration

                with torch.no_grad():

                    labels: Optional[ModelOutput[MemModelFields]] = None
                    preds: Optional[ModelOutput[MemModelFields]] = None
                    val_losses = []

                    for i, (x, y_) in tqdm(enumerate(test_dl),
                                           total=len(test_dl)):
                        val_iteration += 1

                        y = ModelOutput(y_)
                        y_numpy = y.to_numpy()

                        labels = y_numpy if labels is None else labels.merge(
                            y_numpy)

                        x = x.to(device)
                        y = y.to_device(device)

                        out = ModelOutput(model(x, y.get_data()))
                        out_numpy = out.to_device('cpu').to_numpy()
                        preds = out_numpy if preds is None else preds.merge(
                            out_numpy)

                        loss_vals = {
                            name: l(out, y)
                            for name, l in losses.items()
                        }
                        loss = torch.stack(list(loss_vals.values())).sum()
                        utils.log_loss(logger,
                                       loss,
                                       loss_vals,
                                       val_iteration,
                                       phase='val')

                        val_losses.append(loss.item())

                    print("Calculating validation metric...")
                    # print("preds", {k: v.shape for k, v in preds.items()})
                    # assert False
                    metrics = {
                        fname: f(labels, preds, losses)
                        for fname, f in additional_metrics.items()
                    }
                    print("Validation metrics", metrics)

                    for k, v in metrics.items():
                        if isinstance(v, numbers.Number):
                            logger.add_scalar('Metric_{}'.format(k), v,
                                              iteration)

                    metrics['total_val_loss'] = sum(val_losses)

                    ckpt_path = os.path.join(
                        ckpt_savedir, utils.get_ckpt_path(epoch, metrics))
                    save_ckpt(ckpt_path, model, epoch, iteration, optimizer,
                              dset_name, model_name, metrics)

            # end of epoch
            lr_scheduler.step()

            save_ckpt(last_ckpt_path, model, epoch, iteration, optimizer,
                      dset_name, model_name)

    except KeyboardInterrupt:
        print('Got keyboard interrupt, saving model...')
        save_ckpt(last_ckpt_path, model, epoch, iteration, optimizer,
                  dset_name, model_name)
示例#31
0
                    help='Beta 1 for Adam optimizer')
parser.add_argument('--lr', type=float, default=0.0002, help='Learning rate')
parser.add_argument('--epochs',
                    type=int,
                    default=20,
                    help='Number of epochs to train')
parser.add_argument('--feature_size',
                    type=int,
                    default=100,
                    help='Dimensionality of the random noise vector fed to the generator')

# Parse all the arguments
args = parser.parse_args()

# Get dataset and data loader
dataset, num_channels = get_dataset(args.dataset)
data_loader = DataLoader(dataset,
                         batch_size=args.batch_size,
                         shuffle=True,
                         num_workers=2)

# Check whether GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Instantiate generator and discriminator
generator = Generator(args.feature_size, num_channels).to(device)
discriminator = Discriminator(num_channels=num_channels).to(device)

# Select loss function and optimizer
loss_fn = torch.nn.BCELoss()
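# With BCELoss the discriminator is typically trained towards label 1 on real
# images and label 0 on generated ones, while the generator is trained towards
# label 1 on its own outputs (the standard DCGAN-style objective).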
optimizer_d = optim.Adam(discriminator.parameters(),