Example #1
def main():

    parser = argparse.ArgumentParser()

    parser.add_argument("--data_path", type=str, default="data.joblib")
    parser.add_argument("--test_strat", type=int, default=0)
    parser.add_argument("--device_id", type=int, default=0)
    parser.add_argument("--num_epochs_s2cnn", type=int, default=30)
    parser.add_argument("--num_epochs_mlp", type=int, default=30)
    parser.add_argument("--batch_size_s2cnn", type=int, default=32)
    parser.add_argument("--batch_size_mlp", type=int, default=32)
    parser.add_argument("--init_learning_rate_s2cnn", type=int, default=1e-3)
    parser.add_argument("--learning_rate_mlp", type=int, default=1e-3)
    parser.add_argument("--learning_rate_decay_epochs", type=int, default=10)

    args = parser.parse_args()

    torch.cuda.set_device(args.device_id)

    print("evaluating on {}".format(args.test_strat))

    print("loading data...", end="")
    data, train_idxs, test_idxs = load_data(args.data_path,
                                            args.test_strat,
                                            cuda=args.device_id)
    print("done!")

    mlp = BaselineRegressor()
    s2cnn = S2CNNRegressor()

    if torch.cuda.is_available():
        for model in [mlp, s2cnn]:
            model.cuda(args.device_id)

    print("training baseline model")
    print("mlp #params: {}".format(count_params(mlp)))
    train_baseline(
        mlp, data,
        IndexBatcher(train_idxs, args.batch_size_mlp, cuda=args.device_id),
        IndexBatcher(test_idxs, args.batch_size_mlp, cuda=args.device_id),
        args.num_epochs_mlp, args.learning_rate_mlp, args.device_id)

    print("training residual s2cnn model")
    print("s2cnn #params: {}".format(count_params(s2cnn)))
    train_s2cnn(
        mlp, s2cnn, data,
        IndexBatcher(train_idxs, args.batch_size_s2cnn, cuda=args.device_id),
        IndexBatcher(test_idxs, args.batch_size_s2cnn, cuda=args.device_id),
        args.num_epochs_s2cnn, args.init_learning_rate_s2cnn,
        args.learning_rate_decay_epochs, args.device_id)
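Every snippet on this page calls a count_params helper that the excerpts never define. A minimal PyTorch sketch, assuming it simply sums the element counts of a module's parameters (the TensorFlow variants used later differ):

def count_params(model):
    # Total element count over all parameters; filter on p.requires_grad
    # if only trainable parameters should be counted.
    return sum(p.numel() for p in model.parameters())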
Example #2
def test_conv(Base, nruns=100, device='cuda'):
    # Basic sanity check: the custom convolution should produce the same spatial
    # dimensions as the standard module, so the two convolution types can be
    # swapped conveniently.
    chi, cho, k, s = 8, 32, 3, 1

    x = torch.randn(16, chi, 512, 512)
    conv = Base(chi, cho, k, s, autopad(k))
    conv_ = nn.Conv2d(chi, cho, k, s, autopad(k))

    if 'cuda' in device:
        assert torch.cuda.is_available()
        conv.cuda().train()
        conv_.cuda().train()
        x = x.cuda()

        if torch.backends.cudnn.benchmark:
            # have to do warm up iterations for fair comparison
            print('benchmark warm up...')
            for _ in range(50):
                _ = conv(x)
    else:
        conv.cpu().train()
        conv_.cpu().train()
        nruns = 1

    p = count_params(conv)
    p_ = count_params(conv_)
    # relative parameter count in parentheses, w.r.t. nn.Conv2d
    print(f'Number of parameters: {p} ({p / p_ * 100:.2f}%)')

    # ensure same behaviour as standard module
    out = conv(x)
    out_ = conv_(x)
    assert out.shape == out_.shape, f'Shape mismatch, should be {out_.shape} but is {out.shape}'

    # g0 = torch.randn_like(out)
    # performance test without feature/target loading
    # because that would require a significant amount of overhead
    start = time_synchronized()
    for _ in range(nruns):
        out = conv(x)
        for param in conv.parameters():
            param.grad = None
        out.mean().backward()  # out.backward(g0)
    end = time_synchronized()

    print(f'Forward + Backward time: {(end - start) * 1000 / nruns:.3f}ms')
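test_conv also relies on two helpers that the excerpt does not define. Hedged sketches, assuming autopad returns 'same' padding for an odd square kernel and time_synchronized flushes pending CUDA work before reading the wall clock:

import time
import torch


def autopad(k):
    # 'same' padding for an odd kernel size (assumed behaviour of the helper)
    return k // 2


def time_synchronized():
    # Wait for queued CUDA kernels so that wall-clock timings are meaningful
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return time.time()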
Example #3
def test_generator(generator, true_count):
    tf.reset_default_graph()
    with get_session() as sess:
        y = generator(tf.ones((1, 96)), 3)
        cur_count = count_params()
        if cur_count != true_count:
            print(
                'Incorrect number of parameters in generator. {0} instead of {1}. Check your architecture.'
                .format(cur_count, true_count))
        else:
            print('Correct number of parameters in generator.')
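Here count_params is called with no arguments; a minimal TF1-style sketch, assuming it counts every trainable variable in the current default graph:

import numpy as np
import tensorflow as tf


def count_params():
    # Total element count over all trainable variables in the default graph.
    return int(sum(np.prod(v.get_shape().as_list())
                   for v in tf.trainable_variables()))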
Example #4
        x = F.relu(self.sgcn3(x) + self.gcn3d3(x), inplace=True)
        x = self.tcn3(x)

        out = x
        out_channels = out.size(1)
        out = out.view(N, M, out_channels, -1)
        out = out.mean(3)  # Global Average Pooling (Spatial+Temporal)
        out = out.mean(1)  # Average pool number of bodies in the sequence
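        # Assumed shapes: tcn3 outputs (N*M, C_out, T', V); the view gives
        # (N, M, C_out, T'*V), mean(3) -> (N, M, C_out), mean(1) -> (N, C_out).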

        out = self.fc(out)
        return out


if __name__ == "__main__":
    # For debugging purposes
    import sys
    sys.path.append('..')

    model = Model(num_class=60,
                  num_point=25,
                  num_person=2,
                  num_gcn_scales=13,
                  num_g3d_scales=6,
                  graph='graph.ntu_rgb_d.AdjMatrixGraph')

    N, C, T, V, M = 6, 3, 50, 25, 2
    x = torch.randn(N, C, T, V, M)
    model.forward(x)

    print('Model total # params:', count_params(model))
Example #5
    def build(self):
        """ Builds a multi-tower model
        """
        with tf.device('/cpu:0'):
            assert self.batch_size % self.num_gpus == 0, (
                'Batch size must be divisible by number of GPUs')

            with tf.name_scope('Input_splits'):
                tower_inputs = [[] for i in range(self.num_gpus)]
                for inp in self.Inputs:
                    splits = tf.split(inp, self.num_gpus, name=inp.name[:-2])
                    for i, s in enumerate(splits):
                        tower_inputs[i].append(s)

            tower_outputs = []
            tower_losses = []
            tower_grads = []
            with tf.variable_scope(tf.get_variable_scope()):
                for i in range(self.num_gpus):
                    with tf.device('/gpu:%d' % i):
                        with tf.name_scope('%s_%d' % ('tower', i)) as scope:
                            # Calculate the loss for one tower of the model. This function
                            # constructs the entire model but shares the variables across
                            # all towers.
                            outputs, losses, grads = self._build_train_tower(
                                tower_inputs[i],
                                int(self.batch_size / self.num_gpus),
                                reuse=i > 0 or self.model_built)

                            # Reuse variables for the next tower.
                            tf.get_variable_scope().reuse_variables()

                            # Save summaries from the first tower only
                            if i == 0:
                                summaries = tf.get_collection(
                                    tf.GraphKeys.SUMMARIES, scope)

                            tower_outputs.append(outputs)
                            tower_losses.append(losses)
                            tower_grads.append(grads)

            with tf.name_scope('Concat_outputs'):
                outputs = [[] for _ in tower_outputs[0]]
                for t_outputs in tower_outputs:
                    for i, output in enumerate(t_outputs):
                        outputs[i].append(output)
                self.outputs = []
                for outs in outputs:
                    self.outputs.append(tf.concat(outs, 0))

            with tf.name_scope('Concat_losses'):
                losses = [[] for _ in range(len(tower_losses[0]))]
                for t_losses in tower_losses:
                    for i, loss in enumerate(t_losses):
                        losses[i].append(loss)

            with tf.name_scope('Average_grads'):
                var_grads = [[] for _ in range(len(tower_grads[0]))]
                for t_grads in tower_grads:
                    for i, grad in enumerate(t_grads):
                        var_grads[i].append(grad)
                avg_grads = []
                for v_grads in var_grads:
                    avg_grads.append(ops.average_gradients(v_grads))

            if self.grad_summ:
                # Add histograms for gradients.
                with tf.name_scope('Grad_summary'):
                    grads_summ = []
                    for var_grads in avg_grads:
                        for grad, var in var_grads:
                            if grad is not None:
                                grads_summ.append(
                                    tf.summary.histogram(
                                        self._remove_tower_name_prefix(var) +
                                        '/Grads', grad))
                    summaries.append(tf.summary.merge(grads_summ))

            if self.var_summ:
                # Add histograms for trainable variables.
                t_vars = tf.trainable_variables()
                with tf.name_scope('Var_summary'):
                    vars_summ = []
                    for var in t_vars:
                        vars_summ.append(
                            tf.summary.histogram(
                                self._remove_tower_name_prefix(var), var))
                    summaries.append(tf.summary.merge(vars_summ))

            summaries += self.additional_summaries()

            self._tower_outputs(self.outputs)
            self._build_train_ops(losses, avg_grads)
            self.summary_op = tf.summary.merge(summaries, name='summary_op')
            self.saver = tf.train.Saver()
            self.model_built = True
            utils.count_params()
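ops.average_gradients is not shown in this excerpt; a sketch following the classic multi-GPU recipe, assuming each tower contributes its (gradient, variable) pairs for the same variables in the same order:

import tensorflow as tf


def average_gradients(tower_grads):
    # tower_grads: one list of (grad, var) pairs per tower.
    # Returns a single list of (averaged grad, var) pairs.
    averaged = []
    for grad_and_vars in zip(*tower_grads):
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars if g is not None]
        grad = tf.reduce_mean(tf.concat(grads, 0), 0) if grads else None
        averaged.append((grad, grad_and_vars[0][1]))
    return averaged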
Example #6
def main():
    args = parse_args()
    #args.dataset = "datasets"

    if args.name is None:
        if args.deepsupervision:
            args.name = '%s_%s_wDS' % (args.dataset, args.arch)
        else:
            args.name = '%s_%s_woDS' % (args.dataset, args.arch)
    if not os.path.exists('models/%s' % args.name):
        os.makedirs('models/%s' % args.name)

    print('Config -----')
    for arg in vars(args):
        print('%s: %s' % (arg, getattr(args, arg)))
    print('------------')

    with open('models/%s/args.txt' % args.name, 'w') as f:
        for arg in vars(args):
            print('%s: %s' % (arg, getattr(args, arg)), file=f)

    joblib.dump(args, 'models/%s/args.pkl' % args.name)

    # define loss function (criterion)
    if args.loss == 'BCEWithLogitsLoss':
        criterion = nn.BCEWithLogitsLoss().cuda()
    else:
        criterion = losses.__dict__[args.loss]().cuda()

    cudnn.benchmark = True

    # Data loading code
    img_paths = glob(
        r'D:\Project\CollegeDesign\dataset\Brats2018FoulModel2D\trainImage\*')
    mask_paths = glob(
        r'D:\Project\CollegeDesign\dataset\Brats2018FoulModel2D\trainMask\*')

    train_img_paths, val_img_paths, train_mask_paths, val_mask_paths = \
        train_test_split(img_paths, mask_paths, test_size=0.2, random_state=41)
    print("train_num:%s" % str(len(train_img_paths)))
    print("val_num:%s" % str(len(val_img_paths)))

    # create model
    print("=> creating model %s" % args.arch)
    model = FCN.__dict__[args.arch](args)

    model = model.cuda()

    print(count_params(model))

    if args.optimizer == 'Adam':
        optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                      model.parameters()),
                               lr=args.lr)
    elif args.optimizer == 'SGD':
        optimizer = optim.SGD(filter(lambda p: p.requires_grad,
                                     model.parameters()),
                              lr=args.lr,
                              momentum=args.momentum,
                              weight_decay=args.weight_decay,
                              nesterov=args.nesterov)

    train_dataset = Dataset(args, train_img_paths, train_mask_paths, args.aug)
    val_dataset = Dataset(args, val_img_paths, val_mask_paths)

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               pin_memory=True,
                                               drop_last=True)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             pin_memory=True,
                                             drop_last=False)

    log = pd.DataFrame(
        index=[],
        columns=['epoch', 'lr', 'loss', 'iou', 'val_loss', 'val_iou'])

    best_iou = 0
    trigger = 0
    for epoch in range(args.epochs):
        print('Epoch [%d/%d]' % (epoch, args.epochs))

        # train for one epoch
        train_log = train(args, train_loader, model, criterion, optimizer,
                          epoch)
        # evaluate on validation set
        val_log = validate(args, val_loader, model, criterion)

        print('loss %.4f - iou %.4f - val_loss %.4f - val_iou %.4f' %
              (train_log['loss'], train_log['iou'], val_log['loss'],
               val_log['iou']))

        tmp = pd.Series(
            [
                epoch,
                args.lr,
                train_log['loss'],
                train_log['iou'],
                val_log['loss'],
                val_log['iou'],
            ],
            index=['epoch', 'lr', 'loss', 'iou', 'val_loss', 'val_iou'])

        log = log.append(tmp, ignore_index=True)
        log.to_csv('models/%s/log.csv' % args.name, index=False)

        trigger += 1

        if val_log['iou'] > best_iou:
            torch.save(model.state_dict(), 'models/%s/model.pth' % args.name)
            best_iou = val_log['iou']
            print("=> saved best model")
            trigger = 0

        # early stopping
        if args.early_stop is not None:
            if trigger >= args.early_stop:
                print("=> early stopping")
                break

        torch.cuda.empty_cache()
Example #7
        inverse_model.load_state_dict(torch.load('./models/inverse_model.pth')['model_state_dict'])
        model = TandemNet(forward_model, inverse_model)
        optimizer = torch.optim.Adam(model.inverse_model.parameters(), lr=configs['learning_rate'], weight_decay=configs['weight_decay'])
        
    elif args.model in ['vae']:
        model = cVAE(configs['input_dim'], configs['latent_dim']).to(DEVICE)
        optimizer = torch.optim.Adam(model.parameters(), lr=configs['learning_rate'], weight_decay=configs['weight_decay'])

    elif args.model in ['gan']:
        model = cGAN(configs['input_dim'], configs['output_dim'], configs['noise_dim']).to(DEVICE)
        model.apply(weights_init_normal)

        optimizer_G = torch.optim.Adam(model.generator.parameters(), lr=configs['g_learning_rate'], weight_decay=configs['weight_decay'])
        optimizer_D = torch.optim.Adam(model.discriminator.parameters(), lr=configs['d_learning_rate'], weight_decay=configs['weight_decay'])

        print('Model {}, Number of parameters {}'.format(args.model, count_params(model)))
        criterion = torch.nn.BCELoss()
        trainer = GANTrainer(model, optimizer_G, optimizer_D, train_loader, val_loader, test_loader, criterion, configs['epochs'], args.model)
        trainer.fit()
        sys.exit(0)

    elif args.model in ['inn']:
        
        model = INN(configs['ndim_total'], configs['input_dim'], configs['output_dim'], dim_z = configs['latent_dim']).to(DEVICE)
        print('Model {}, Number of parameters {}'.format(args.model, count_params(model)))
        optimizer = torch.optim.Adam(model.parameters(), lr=configs['learning_rate'], weight_decay=configs['weight_decay'])

        criterion = torch.nn.MSELoss()
        trainer = INNTrainer(model, optimizer, train_loader, val_loader, test_loader, criterion, configs['epochs'], args.model)
        trainer.fit()
Example #8
        self.outconvm1 = nn.Conv2d(64, out_chan, 1)

    def forward(self, x):
        x, y1 = self.down1(x)
        x, y2 = self.down2(x)
        x, y3 = self.down3(x)
        x, y4 = self.down4(x)
        x = F.dropout2d(F.relu(self.bn1(self.conv1(x))))
        x = F.dropout2d(F.relu(self.bn2(self.conv2(x))))
        x = self.up4(x, y4)
        x = self.up3(x, y3)
        x = self.up2(x, y2)
        x = self.up1(x, y1)
        x1 = self.outconv(x)

        return x1


if __name__ == '__main__':

    model = Unet(nn.Module)
    args = None
    # create model
    device = 'cpu'
    models = model.to(device)
    # solution: 1
    model = models.cpu()
    summary(model, (4, 160, 160))
    # print(model)
    print(count_params(model))
Example #9
def main():
    parser = argparse.ArgumentParser()
    # Model and data are required
    parser.add_argument(
        "--dir_pretrained_model",
        type=str,
        required=True,
        help=
        "Dir containing pre-trained model (checkpoint), which may have been fine-tuned already."
    )

    # Required for certain modes (--resume, --do_train, --eval_during_training, --do_eval or --do_pred)
    parser.add_argument(
        "--dir_train",
        type=str,
        help=
        ("Dir containing training data (n files named <lang>.train containing unlabeled text)"
         ))
    parser.add_argument(
        "--dir_output",
        type=str,
        help=
        "Directory in which model will be written (required if --do_train (but not --resume) or --do_pred)"
    )
    parser.add_argument(
        "--path_dev",
        type=str,
        help="Path of 2-column TSV file containing labeled validation examples."
    )
    parser.add_argument(
        "--path_test",
        type=str,
        required=False,
        help="Path of text file containing unlabeled test examples.")
    # Execution modes
    parser.add_argument(
        "--resume",
        action="store_true",
        help=
        "Resume training model in --dir_pretrained_model (note: --dir_output will be ignored)"
    )
    parser.add_argument("--do_train", action="store_true", help="Run training")
    parser.add_argument("--eval_during_training",
                        action="store_true",
                        help="Run evaluation on dev set during training")
    parser.add_argument("--do_eval",
                        action="store_true",
                        help="Evaluate model on dev set")
    parser.add_argument("--do_pred",
                        action="store_true",
                        help="Run prediction on test set")

    # Score to optimize on dev set (by early stopping)
    parser.add_argument(
        "--score_to_optimize",
        choices=["track1", "track2", "track3"],
        default="track3",
        help="Score to optimize on dev set during training (by early stopping)."
    )

    # Hyperparameters
    parser.add_argument(
        "--freeze_encoder",
        action="store_true",
        help=
        "Freeze weights of pre-trained encoder. (Note: in this case, we do not keep doing MLM.)"
    )
    parser.add_argument(
        "--no_mlm",
        action="store_true",
        help=
        "Do not keep doing masked language modeling (MLM) during fine-tuning.")
    parser.add_argument(
        "--sampling_alpha",
        type=float,
        default=1.0,
        help=
        "Dampening factor for relative frequencies used to compute language sampling probabilities"
    )
    parser.add_argument(
        "--weight_relevant",
        type=float,
        default=1.0,
        help=
        "Relative sampling frequency of relevant languages wrt irrelevant languages"
    )
    parser.add_argument("--train_batch_size",
                        default=16,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=64,
                        type=int,
                        help="Total batch size for evaluation.")
    parser.add_argument(
        "--seq_len",
        default=128,
        type=int,
        help=
        "Length of input sequences. Shorter seqs are padded, longer ones are trucated"
    )
    parser.add_argument("--learning_rate",
                        default=1e-4,
                        type=float,
                        help="The initial learning rate for AdamW optimizer.")
    parser.add_argument("--equal_betas",
                        action='store_true',
                        help="Use beta1=beta2=0.9 for AdamW optimizer.")
    parser.add_argument(
        "--correct_bias",
        action='store_true',
        help=
        "Correct bias in AdamW optimizer (correct_bias=False is meant to reproduce BERT behaviour exactly."
    )
    parser.add_argument(
        "--max_train_steps",
        default=1000000,
        type=int,
        help=
        "Maximum number of training steps to perform. Note: # optimization steps = # train steps / # accumulation steps."
    )
    parser.add_argument(
        "--num_train_steps_per_epoch",
        default=1000,
        type=int,
        help=
        "Number of training steps that equals one epoch. Note: # optimization steps = # train steps / # accumulation steps."
    )
    parser.add_argument(
        '--grad_accum_steps',
        type=int,
        default=1,
        help=
        "Number of training steps (i.e. batches) to accumualte before performing a backward/update pass."
    )
    parser.add_argument(
        "--num_gpus",
        type=int,
        default=-1,
        help="Num GPUs to use for training (0 for none, -1 for all available)")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    args = parser.parse_args()

    # Distributed or parallel?
    if args.local_rank != -1 or args.num_gpus > 1:
        raise NotImplementedError(
            "No distributed or parallel training available at the moment.")
    if torch.cuda.is_available():
        args.device = torch.device("cuda")
        args.n_gpu = 1
    else:
        args.device = torch.device("cpu")
        args.n_gpu = 0

    # Check execution mode
    assert args.resume or args.do_train or args.do_eval or args.do_pred
    if args.resume:
        assert not args.do_train
        assert not args.do_eval
        assert not args.do_pred

    # Load checkpoint. This contains a pre-trained model which may or
    # may not have been fine-tuned for language identification already
    logger.info("Loading checkpoint...")
    checkpoint_path = os.path.join(args.dir_pretrained_model, "checkpoint.tar")
    checkpoint_data = torch.load(checkpoint_path)
    if args.resume:
        # Check progress
        logger.info("Resuming training. Currently at training step %d" %
                    checkpoint_data["global_step"])
        # Replace args with initial args for this job, except for
        # num_gpus, seed and model directory
        current_num_gpus = args.n_gpu
        current_dir_pretrained_model = args.dir_pretrained_model
        args = deepcopy(checkpoint_data["initial_args"])
        args.num_gpus = current_num_gpus
        args.dir_pretrained_model = current_dir_pretrained_model
        args.resume = True
        logger.info("Args (most have been reloaded from checkpoint): %s" %
                    args)
    else:
        if args.eval_during_training:
            assert args.do_train
        if args.do_train or args.do_pred:
            assert args.dir_output is not None
            if os.path.exists(args.dir_output) and os.path.isdir(
                    args.dir_output) and len(os.listdir(args.dir_output)) > 1:
                msg = "%s already exists and is not empty" % args.dir_output
                raise ValueError(msg)
            if not os.path.exists(args.dir_output):
                os.makedirs(args.dir_output)
        if args.do_train:
            assert args.dir_train is not None
            train_paths = glob.glob(os.path.join(args.dir_train, "*.train"))
            assert len(train_paths) > 0
            checkpoint_data["initial_args"] = args
        if args.do_train and args.freeze_encoder and not args.no_mlm:
            logger.warning(
                "Setting --no_mlm to True since --freeze_encoder is True, therefore doing MLM would be pointless."
            )
            args.no_mlm = True
    if args.do_eval or args.eval_during_training:
        assert args.path_dev is not None
        assert os.path.exists(args.path_dev)
    if args.do_pred:
        assert args.path_test is not None
        assert os.path.exists(args.path_test)
    if args.grad_accum_steps < 1:
        raise ValueError(
            "Invalid grad_accum_steps parameter: {}, should be >= 1".format(
                args.grad_accum_steps))

    # Create list of languages we handle
    lang_list = sorted(ALL_LANGS)

    # Seed RNGs
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # Load tokenizer
    logger.info("Loading tokenizer...")
    tokenizer_path = os.path.join(args.dir_pretrained_model, "tokenizer.pkl")
    with open(tokenizer_path, "rb") as f:
        tokenizer = pickle.load(f)

    # Make encoder and model
    logger.info("Making encoder...")
    encoder_config = BertConfig.from_json_file(
        os.path.join(args.dir_pretrained_model, "config.json"))
    encoder = BertForMaskedLM(encoder_config)
    logger.info("Making model...")
    model = BertForLangID(encoder, lang_list)
    model.to(args.device)

    # Load model weights. First, check if we just have an encoder, or a previously fine-tuned model
    if "classifier.dense.weight" in checkpoint_data["model_state_dict"]:
        if "best_model_state_dict" in checkpoint_data and not args.resume:
            logger.info("Loading model weights from 'best_model_state_dict'")
            model.load_state_dict(checkpoint_data["best_model_state_dict"])
        else:
            logger.info("Loading model weights from 'model_state_dict'")
            model.load_state_dict(checkpoint_data["model_state_dict"])
    else:
        # Model has not previously been fine-tuned, so we only load encoder weights
        assert args.do_train
        logger.info("Loading encoder weights from 'model_state_dict'")
        model.encoder.load_state_dict(checkpoint_data["model_state_dict"])
    if (args.do_train or args.resume) and args.freeze_encoder:
        model.freeze_encoder()

    # Write encoder config and tokenizer in output directory
    if (not args.resume) and args.do_train:
        path_config = os.path.join(args.dir_output, "config.json")
        model.encoder.config.to_json_file(path_config)
        path_tokenizer = os.path.join(args.dir_output, "tokenizer.pkl")
        with open(path_tokenizer, "wb") as f:
            pickle.dump(tokenizer, f)

    # Log some info on the model
    logger.info("Encoder config: %s" % repr(model.encoder.config))
    logger.info("Model params:")
    for n, p in model.named_parameters():
        msg = "  %s" % n
        if not p.requires_grad:
            msg += " ***FROZEN***"
        logger.info(msg)
    logger.info("Nb model params: %d" % count_params(model))
    logger.info("Nb params in encoder: %d" % count_params(model.encoder))
    logger.info("Nb params in pooler: %d" % count_params(model.pooler))
    logger.info("Nb params in classifier: %d" % count_params(model.classifier))

    # Get data
    max_seq_length = args.seq_len + 2  # We add 2 for CLS and SEP
    if args.resume:
        # Reload training dataset(s)
        logger.info("Reloading training data from checkpoint")
        train_dataset = checkpoint_data["train_dataset"]
        train_dataset.prep_files_for_streaming()
        dev_dataset = checkpoint_data.get("dev_dataset", None)
        unk_dataset = checkpoint_data.get("unk_dataset", None)
        if unk_dataset:
            unk_dataset.prep_files_for_streaming()
    elif args.do_train:
        # Remove unk.train if present, and create a MLM dataset for it.
        path_unk = check_for_unk_train_data(train_paths)
        if path_unk is None:
            unk_dataset = None
        else:
            train_paths.remove(path_unk)
            logger.info("Creating MLM-only training set from %s..." % path_unk)
            unk_dataset = BertDatasetForMLM(
                [path_unk],
                tokenizer,
                max_seq_length,
                sampling_alpha=args.sampling_alpha,
                weight_relevant=args.weight_relevant,
                encoding="utf-8",
                seed=args.seed,
                verbose=DEBUG)

        logger.info("Creating training set from %s training files in %s..." %
                    (len(train_paths), args.dir_train))
        train_dataset = BertDatasetForClassification(
            train_paths,
            tokenizer,
            max_seq_length,
            include_mlm=True,
            sampling_alpha=args.sampling_alpha,
            weight_relevant=args.weight_relevant,
            encoding="utf-8",
            seed=args.seed,
            verbose=DEBUG)
        if path_unk is not None:
            assert len(unk_dataset) == len(train_dataset)
        # Check train_dataset.lang2id: keys should contain all langs, and nothing else, like that of the model
        assert train_dataset.lang2id == model.lang2id
    if not args.resume:
        dev_dataset = None
        if args.do_eval or args.eval_during_training:
            logger.info("Loading validation data from %s..." % args.path_dev)
            dev_dataset = BertDatasetForTesting(args.path_dev,
                                                tokenizer,
                                                model.lang2id,
                                                max_seq_length,
                                                require_labels=True,
                                                encoding="utf-8",
                                                verbose=DEBUG)
        if args.do_train and args.eval_during_training:
            checkpoint_data["dev_dataset"] = dev_dataset
        if args.do_pred:
            logger.info("Loading test data from %s..." % args.path_test)
            test_dataset = BertDatasetForTesting(args.path_test,
                                                 tokenizer,
                                                 model.lang2id,
                                                 max_seq_length,
                                                 require_labels=False,
                                                 encoding="utf-8",
                                                 verbose=DEBUG)

    # Compute number of epochs and steps, initialize number of training steps done.
    num_opt_steps_per_epoch = args.num_train_steps_per_epoch // args.grad_accum_steps
    if args.do_train and (not args.resume):
        checkpoint_data["global_step"] = 0
        checkpoint_data[
            "max_opt_steps"] = args.max_train_steps // args.grad_accum_steps
    args.num_epochs = math.ceil(checkpoint_data["max_opt_steps"] /
                                num_opt_steps_per_epoch)

    # Training
    if args.do_train or args.resume:
        # Prepare optimizer
        logger.info("Preparing optimizer...")
        np_list = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        opt_params = [{
            'params':
            [p for n, p in np_list if not any(nd in n for nd in no_decay)],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in np_list if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        if args.equal_betas:
            betas = (0.9, 0.9)
        else:
            betas = (0.9, 0.999)
        optimizer = AdamW(
            opt_params,
            lr=args.learning_rate,
            betas=betas,
            correct_bias=args.correct_bias
        )  # To reproduce BertAdam specific behaviour, use correct_bias=False

        # Load optimizer state if resuming
        if args.resume:
            optimizer.load_state_dict(checkpoint_data["optimizer_state_dict"])

        # Log some info before training
        logger.info("*** Training info: ***")
        logger.info("  Number of training steps completed: %d" %
                    checkpoint_data["global_step"])
        logger.info("  Max training steps: %d" % args.max_train_steps)
        logger.info("  Gradient accumulation steps: %d" %
                    args.grad_accum_steps)
        logger.info("  Max optimization steps: %d" %
                    checkpoint_data["max_opt_steps"])
        logger.info("  Training dataset size: %d" % len(train_dataset))
        logger.info("  Batch size: %d" % args.train_batch_size)
        logger.info("  # training steps/epoch: %d" %
                    (args.num_train_steps_per_epoch))
        logger.info("  # optimization steps/epoch: %d" %
                    num_opt_steps_per_epoch)
        logger.info("  # epochs to do: %d" % args.num_epochs)
        if args.eval_during_training:
            logger.info("Validation dataset size: %d" % len(dev_dataset))

        # Run training
        train(model,
              optimizer,
              train_dataset,
              args,
              checkpoint_data,
              dev_dataset=dev_dataset,
              unk_dataset=unk_dataset)
        # Reload model
        save_to_dir = args.dir_pretrained_model if args.resume else args.dir_output
        checkpoint_data = torch.load(
            os.path.join(save_to_dir, "checkpoint.tar"))
        if "best_model_state_dict" in checkpoint_data:
            model.load_state_dict(checkpoint_data["best_model_state_dict"])
        else:
            model.load_state_dict(checkpoint_data["model_state_dict"])

    # Evaluate model on dev set
    if args.do_eval:
        logger.info("*** Running evaluation... ***")
        scores = evaluate(model, dev_dataset, args)
        logger.info("***** Evaluation Results *****")
        for score_name in sorted(scores.keys()):
            logger.info("- %s: %.4f" % (score_name, scores[score_name]))

    # Get model's predictions on test set
    if args.do_pred:
        logger.info("*** Running prediction... ***")
        logits = predict(model, test_dataset, args)
        pred_class_ids = np.argmax(logits.cpu().numpy(), axis=1)
        pred_labels = [test_dataset.label_list[i] for i in pred_class_ids]
        path_pred = os.path.join(args.dir_output, "pred.txt")
        logger.info("Writing predictions in %s..." % path_pred)
        with open(path_pred, 'w', encoding="utf-8") as f:
            for x in pred_labels:
                f.write("%s\n" % x)
Example #10
    def run(self):
        # load data
        args_dict = self._default_configs()
        args = dotdict(args_dict)
        feature_dirs, label_dirs = get_data(datadir, level, train_dataset,
                                            dev_dataset, test_dataset, mode)
        batchedData, maxTimeSteps, totalN = self.load_data(
            feature_dirs[0], label_dirs[0], mode, level)
        model = model_fn(args, maxTimeSteps)
        FL_pair = list(zip(feature_dirs, label_dirs))
        random.shuffle(FL_pair)
        feature_dirs, label_dirs = zip(*FL_pair)
        print("Feature dirs:", feature_dirs)
        for feature_dir, label_dir in zip(feature_dirs, label_dirs):
            id_dir = feature_dirs.index(feature_dir)
            print('dir id:{}'.format(id_dir))
            batchedData, maxTimeSteps, totalN = self.load_data(
                feature_dir, label_dir, mode, level)
            model = model_fn(args, maxTimeSteps)
            num_params = count_params(model, mode='trainable')
            all_num_params = count_params(model, mode='all')
            model.config['trainable params'] = num_params
            model.config['all params'] = all_num_params
            print(model.config)
            with tf.Session(graph=model.graph) as sess:
                # restore from stored model
                if keep:
                    ckpt = tf.train.get_checkpoint_state(savedir)
                    if ckpt and ckpt.model_checkpoint_path:
                        model.saver.restore(sess, ckpt.model_checkpoint_path)
                        print('Model restored from:' + savedir)
                else:
                    print('Initializing')
                    sess.run(model.initial_op)
                total_cont = 0
                for epoch in range(num_epochs):
                    ## training
                    start = time.time()
                    if mode == 'train':
                        print('Epoch {} ...'.format(epoch + 1))
                    batchErrors = np.zeros(len(batchedData))
                    batchRandIxs = np.random.permutation(len(batchedData))
                    for batch, batchOrigI in enumerate(batchRandIxs):
                        batchInputs, batchTargetSparse, batchSeqLengths = batchedData[
                            batchOrigI]
                        batchTargetIxs, batchTargetVals, batchTargetShape = batchTargetSparse
                        feedDict = {
                            model.inputX: batchInputs,
                            model.targetIxs: batchTargetIxs,
                            model.targetVals: batchTargetVals,
                            model.targetShape: batchTargetShape,
                            model.seqLengths: batchSeqLengths
                        }

                        _, l, pre, y, er = sess.run([
                            model.optimizer, model.loss, model.predictions,
                            model.targetY, model.errorRate
                        ],
                                                    feed_dict=feedDict)
                        batchErrors[batch] = er
                        print(
                            '\n{} mode, total:{},subdir:{}/{},batch:{}/{},epoch:{}/{},train loss={:.3f},mean train CER={:.3f}\n'
                            .format(level, totalN, id_dir + 1,
                                    len(feature_dirs), batch + 1,
                                    len(batchRandIxs), epoch + 1, num_epochs,
                                    l, er / batch_size))
                        total_cont += 1
                        if batch % 20 == 0:
                            print('Truth:\n' +
                                  output_to_sequence(y, type=level))
                            print('Output:\n' +
                                  output_to_sequence(pre, type=level))
                            checkpoint_path = os.path.join(
                                savedir, 'model.ckpt')
                            model.saver.save(sess,
                                             checkpoint_path,
                                             global_step=total_cont)
                            print('Model has been saved in {}'.format(savedir))

                    end = time.time()
                    delta_time = end - start
                    print('Epoch ' + str(epoch + 1) + ' needs time:' +
                          str(delta_time) + ' s')
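The run() method above calls count_params with an explicit mode; a sketch under the assumption that the model object owns its graph and the mode selects trainable vs. all variables:

import numpy as np
import tensorflow as tf


def count_params(model, mode='trainable'):
    # Count variables in the model's graph; mode is either 'trainable' or 'all'.
    with model.graph.as_default():
        variables = tf.trainable_variables() if mode == 'trainable' else tf.global_variables()
        return int(sum(np.prod(v.get_shape().as_list()) for v in variables))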
Example #11
    def train(self, args):
        '''Import data, train the model, and save it.'''
        text_parser = TextParser(args.data_dir, args.batch_size,
                                 args.seq_length)
        args.vocab_size = text_parser.vocab_size
        ckpt = tf.train.get_checkpoint_state(args.save_dir)

        if args.keep is True:
            # check if all necessary files exist
            if os.path.exists(os.path.join(args.save_dir,'config.pkl')) and \
         os.path.exists(os.path.join(args.save_dir,'words_vocab.pkl')) and \
         ckpt and ckpt.model_checkpoint_path:
                with open(os.path.join(args.save_dir, 'config.pkl'),
                          'rb') as f:
                    saved_model_args = cPickle.load(f)
                with open(os.path.join(args.save_dir, 'words_vocab.pkl'),
                          'rb') as f:
                    saved_words, saved_vocab = cPickle.load(f)
            else:
                raise ValueError("configuration doesn't exist!")

        if args.model == 'seq2seq_rnn':
            model = Model_rnn(args)
        else:
            # TO ADD OTHER MODEL
            pass
        trainable_num_params = count_params(model, mode='trainable')
        all_num_params = count_params(model, mode='all')
        args.num_trainable_params = trainable_num_params
        args.num_all_params = all_num_params
        print(args.num_trainable_params)
        print(args.num_all_params)
        with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f:
            cPickle.dump(args, f)
        with open(os.path.join(args.save_dir, 'words_vocab.pkl'), 'wb') as f:
            cPickle.dump((text_parser.vocab_dict, text_parser.vocab_list), f)

        with tf.Session() as sess:
            if args.keep is True:
                print('Restoring')
                model.saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                print('Initializing')
                sess.run(model.initial_op)

            for e in range(args.num_epochs):
                start = time.time()
                #sess.run(tf.assign(model.lr, args.learning_rate * (args.decay_rate ** e)))
                sess.run(tf.assign(model.lr, args.learning_rate))
                model.initial_state = tf.convert_to_tensor(model.initial_state)
                state = model.initial_state.eval()
                total_loss = []
                for b in range(text_parser.num_batches):
                    x, y = text_parser.next_batch()
                    print('flag')
                    feed = {
                        model.input_data: x,
                        model.targets: y,
                        model.initial_state: state
                    }
                    train_loss, state, _ = sess.run(
                        [model.cost, model.final_state, model.train_op], feed)
                    total_loss.append(train_loss)
                    print("{}/{} (epoch {}), train_loss = {:.3f}" \
                                .format(e * text_parser.num_batches + b, \
                                args.num_epochs * text_parser.num_batches, \
                                e, train_loss))
                    if (e * text_parser.num_batches +
                            b) % args.save_every == 0 or (
                                e == args.num_epochs - 1
                                and b == text_parser.num_batches - 1):
                        checkpoint_path = os.path.join(args.save_dir,
                                                       'model.ckpt')
                        model.saver.save(sess, checkpoint_path, global_step=e)
                        print("model has been saved in:" +
                              str(checkpoint_path))
                end = time.time()
                delta_time = end - start
                ave_loss = np.array(total_loss).mean()
                logging(model, ave_loss, e, delta_time, mode='train')
                if ave_loss < 0.5:
                    break
Example #12
def main():
    args = parse_args()

    # add model name to args
    if args.name is None:
        args.name = '%s_%s' % (args.arch, datetime.now().strftime('%m%d%H%M'))
    if not os.path.exists('models/%s' % args.name):
        os.makedirs('models/%s' % args.name)

    print('Config -----')
    for arg in vars(args):
        print('%s: %s' % (arg, getattr(args, arg)))
    print('------------')

    with open('models/%s/args.txt' % args.name, 'w') as f:
        for arg in vars(args):
            print('%s: %s' % (arg, getattr(args, arg)), file=f)

    joblib.dump(args, 'models/%s/args.pkl' % args.name)

    if args.seed is not None:
        random.seed(args.seed)
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    # define loss function (criterion)
    if args.loss == 'BCEWithLogitsLoss':
        criterion = nn.BCEWithLogitsLoss().cuda(args.gpu)
    else:
        criterion = losses.__dict__[args.loss]().cuda(args.gpu)

    cudnn.benchmark = True

    # Data loading code
    train_df = pd.read_csv('input/train.csv')
    img_paths = 'input/train/images/' + train_df['id'].values + '.png'
    mask_paths = 'input/train/masks/' + train_df['id'].values + '.png'

    if args.cv == 'KFold':
        kf = KFold(n_splits=args.n_splits, shuffle=True, random_state=41)
        cv = kf.split(img_paths)
    elif args.cv == 'Cov':
        train_df['cov'] = 0
        for i in tqdm(range(len(train_df))):
            mask = imread('input/train/masks/' + train_df['id'][i] + '.png')
            mask = mask.astype('float32') / 255
            train_df.loc[i, 'cov'] = ((np.sum(mask > 0.5) / 101**2) *
                                      10).astype('int')
        skf = StratifiedKFold(n_splits=args.n_splits,
                              shuffle=True,
                              random_state=41)
        cv = skf.split(img_paths, train_df['cov'])

    for fold, (train_idx, val_idx) in enumerate(cv):
        print('Fold [%d/%d]' % (fold + 1, args.n_splits))

        # create model
        print("=> creating model %s (pretrained=%s)" %
              (args.arch, str(args.pretrained)))
        model = archs.__dict__[args.arch](args)
        if args.freeze_bn:
            model.freeze_bn()

        if args.gpu is not None:
            model = model.cuda(args.gpu)
        else:
            model = torch.nn.DataParallel(model).cuda()

        print(count_params(model))

        if args.optimizer == 'Adam':
            optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                          model.parameters()),
                                   lr=args.lr)
        elif args.optimizer == 'SGD':
            optimizer = optim.SGD(filter(lambda p: p.requires_grad,
                                         model.parameters()),
                                  lr=args.lr,
                                  momentum=args.momentum,
                                  weight_decay=args.weight_decay,
                                  nesterov=args.nesterov)

        if args.scheduler == 'MultiStepLR':
            if args.reduce_epoch is None:
                scheduler = lr_scheduler.MultiStepLR(optimizer,
                                                     milestones=[],
                                                     gamma=0.1)
            else:
                scheduler = lr_scheduler.MultiStepLR(
                    optimizer, milestones=[args.reduce_epoch], gamma=0.1)
        elif args.scheduler == 'CyclicLR':
            scheduler = CyclicLR(optimizer, step_size=800)
        elif args.scheduler == 'CosineAnnealingLR':
            scheduler = lr_scheduler.CosineAnnealingLR(optimizer,
                                                       T_max=args.epochs,
                                                       eta_min=args.min_lr)
        elif args.scheduler == 'StepLR':
            scheduler = lr_scheduler.StepLR(optimizer, 20, gamma=0.5)

        train_img_paths, val_img_paths = img_paths[train_idx], img_paths[
            val_idx]
        train_mask_paths, val_mask_paths = mask_paths[train_idx], mask_paths[
            val_idx]

        train_dataset = Dataset(args, train_img_paths, train_mask_paths)
        val_dataset = Dataset(args, val_img_paths, val_mask_paths, False)

        train_loader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   pin_memory=True,
                                                   drop_last=True)
        val_loader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=args.batch_size,
                                                 shuffle=False,
                                                 pin_memory=True,
                                                 drop_last=False)

        log = pd.DataFrame(
            index=[],
            columns=['epoch', 'lr', 'loss', 'iou', 'val_loss', 'val_iou'])

        best_loss = float('inf')
        trigger = 0
        for epoch in range(args.epochs):

            if args.scheduler == 'CyclicLR':
                # train for one epoch
                train_log = train(args, train_loader, model, criterion,
                                  optimizer, epoch, scheduler)
            else:
                scheduler.step()
                # train for one epoch
                train_log = train(args, train_loader, model, criterion,
                                  optimizer, epoch)

            # evaluate on validation set
            val_log = validate(args, val_loader, model, criterion)

            tmp = pd.Series(
                [
                    epoch,
                    scheduler.get_lr()[0],
                    train_log['loss'],
                    train_log['iou'],
                    val_log['loss'],
                    val_log['iou'],
                ],
                index=['epoch', 'lr', 'loss', 'iou', 'val_loss', 'val_iou'])

            log = log.append(tmp, ignore_index=True)
            log.to_csv('models/%s/log_%d.csv' % (args.name, fold + 1),
                       index=False)

            trigger += 1

            if val_log['loss'] < best_loss:
                torch.save(model.state_dict(),
                           'models/%s/model_%d.pth' % (args.name, fold + 1))
                best_loss = val_log['loss']
                print("=> saved best model")
                trigger = 0

            # early stopping
            if args.early_stop is not None:
                if epoch > args.epochs * 0.8 and trigger >= args.early_stop:
                    print("=> early stopping")
                    break

        torch.cuda.empty_cache()
Example #13
    def run(self):
        # load data
        args_dict = self._default_configs()
        args = dotdict(args_dict)  # build a dotdict, i.e. a dict with attribute-style access
        feature_dirs, label_dirs = get_data(datadir, level, train_dataset,
                                            dev_dataset, test_dataset, mode)

        # batchedData, maxTimeSteps, totalN = self.load_data(feature_dirs[0], label_dirs[0], mode, level)
        # model = model_fn(args, maxTimeSteps)
        # # The purpose of these two lines is unclear; the effect of removing them is unknown.

        # record results for each epoch
        # shuffle feature_dir and label_dir by same order
        FL_pair = list(zip(feature_dirs,
                           label_dirs))  # zip() returns a zip object; list() turns it into a list
        random.shuffle(FL_pair)  # shuffle the order of the paired entries
        feature_dirs, label_dirs = zip(*FL_pair)

        for feature_dir, label_dir in zip(
                feature_dirs, label_dirs):  # zip() results can be iterated directly in a for loop
            id_dir = feature_dirs.index(feature_dir)
            print('dir id:{}'.format(id_dir))
            batchedData, maxTimeSteps, totalN = self.load_data(
                feature_dir, label_dir, mode, level)

            model = model_fn(args, maxTimeSteps)  # build the neural network graph

            num_params = count_params(model, mode='trainable')
            all_num_params = count_params(model, mode='all')
            model.config['trainable params'] = num_params
            model.config['all params'] = all_num_params
            print(model.config)

            with tf.Session(graph=model.graph, config=config) as sess:
                # restore from stored model
                if keep:  # keep == True means resume training from a saved checkpoint
                    ckpt = tf.train.get_checkpoint_state(savedir)
                    # Returns CheckpointState proto from the "checkpoint" file.
                    if ckpt and ckpt.model_checkpoint_path:  # The checkpoint file
                        model.saver.restore(sess, ckpt.model_checkpoint_path)
                        print('Model restored from:' + savedir)
                else:
                    print('Initializing')
                    sess.run(model.initial_op)

                for step in range(num_steps):
                    # training
                    start = time.time()
                    if mode == 'train':
                        print('step {} ...'.format(step + 1))

                    batchErrors = np.zeros(len(batchedData))
                    batchRandIxs = np.random.permutation(len(batchedData))
                    # permutation() on an array returns a shuffled copy

                    for batch, batchOrigI in enumerate(batchRandIxs):
                        # enumerate turns an iterable (e.g. a list or string) into an index
                        # sequence yielding both index and value; this part builds the feed dict
                        batchInputs, batchTargetSparse, batchSeqLengths = batchedData[
                            batchOrigI]
                        batchTargetIxs, batchTargetVals, batchTargetShape = batchTargetSparse
                        feedDict = {
                            model.inputX: batchInputs,
                            model.targetIxs: batchTargetIxs,
                            model.targetVals: batchTargetVals,
                            model.targetShape: batchTargetShape,
                            model.seqLengths: batchSeqLengths
                        }

                        if level == 'cha':
                            if mode == 'train':
                                _, l, pre, y, er = sess.run([
                                    model.optimizer, model.loss,
                                    model.predictions, model.targetY,
                                    model.errorRate
                                ],
                                                            feed_dict=feedDict)

                                batchErrors[
                                    batch] = er  # batchError 207 batch 211

                                print(
                                    '\n{} mode, total:{},subdir:{}/{},batch:{}/{},step:{},train loss={:.3f},mean '
                                    'train CER={:.3f}, epoch: {}\n'.format(
                                        level, totalN, id_dir + 1,
                                        len(feature_dirs), batch + 1,
                                        len(batchRandIxs), step + 1, l,
                                        er / batch_size, num_epochs))

                            elif mode == 'dev':
                                l, pre, y, er = sess.run([
                                    model.loss, model.predictions,
                                    model.targetY, model.errorRate
                                ],
                                                         feed_dict=feedDict)
                                batchErrors[batch] = er
                                print(
                                    '\n{} mode, total:{},subdir:{}/{},batch:{}/{},dev loss={:.3f},'
                                    'mean dev CER={:.3f}\n'.format(
                                        level, totalN, id_dir + 1,
                                        len(feature_dirs), batch + 1,
                                        len(batchRandIxs), l, er / batch_size))

                            elif mode == 'test':
                                l, pre, y, er = sess.run([
                                    model.loss, model.predictions,
                                    model.targetY, model.errorRate
                                ],
                                                         feed_dict=feedDict)
                                batchErrors[batch] = er
                                print(
                                    '\n{} mode, total:{},subdir:{}/{},batch:{}/{},test loss={:.3f},'
                                    'mean test CER={:.3f}\n'.format(
                                        level, totalN, id_dir + 1,
                                        len(feature_dirs), batch + 1,
                                        len(batchRandIxs), l, er / batch_size))
                        elif level == 'seq2seq':
                            raise ValueError('level %s is not supported now' %
                                             str(level))

                        # NOTE: purpose unclear; this would abort the subdir once a
                        # batch reaches a CER of 1.0 (kept disabled)
                        # if er / batch_size == 1.0:
                        #     break

                        if batch % 20 == 0:
                            print('Truth:\n' +
                                  output_to_sequence(y, type=level))
                            print('Output:\n' +
                                  output_to_sequence(pre, type=level))

                        if mode == 'train' and (
                            (step * len(batchRandIxs) + batch + 1) % 20 == 0 or
                            (step == num_steps - 1
                             and batch == len(batchRandIxs) - 1)):
                            # save the model whenever the global batch count is a
                            # multiple of 20, or after the last batch of a subdir
                            checkpoint_path = os.path.join(
                                savedir, 'model.ckpt')
                            model.saver.save(sess,
                                             checkpoint_path,
                                             global_step=step)
                            print('Model has been saved in {}'.format(savedir))

                    end = time.time()
                    delta_time = end - start
                    print('subdir ' + str(id_dir + 1) + ' needs time:' +
                          str(delta_time) + ' s')

                    if mode == 'train':
                        if (step + 1) % 1 == 0:
                            checkpoint_path = os.path.join(
                                savedir, 'model.ckpt')
                            model.saver.save(sess,
                                             checkpoint_path,
                                             global_step=step)
                            print('Model has been saved in {}'.format(savedir))
                        epochER = batchErrors.sum() / totalN
                        print('subdir', id_dir + 1, 'mean train error rate:',
                              epochER)  # reported per subdir rather than per epoch
                        logging(model,
                                logfile,
                                epochER,
                                id_dir,
                                delta_time,
                                mode='config')
                        logging(model,
                                logfile,
                                epochER,
                                id_dir,
                                delta_time,
                                mode=mode)

                    if mode == 'test' or mode == 'dev':
                        with open(
                                os.path.join(resultdir, level + '_result.txt'),
                                'a') as result:
                            result.write(
                                output_to_sequence(y, type=level) + '\n')
                            result.write(
                                output_to_sequence(pre, type=level) + '\n')
                            result.write('\n')
                        epochER = batchErrors.sum() / totalN
                        print(' test error rate:', epochER)
                        logging(model, logfile, epochER, mode=mode)
Exemplo n.º 14
0
def main():
    args = parse_args()

    if args.name is None:
        if args.deepsupervision:
            args.name = '%s_%s_wDS' % (args.dataset, args.arch)
        else:
            args.name = '%s_%s_woDS' % (args.dataset, args.arch)
    if not os.path.exists('models/%s' % args.name):
        os.makedirs('models/%s' % args.name)

    writer = SummaryWriter('models/%s/test' % args.name)

    print('Config -----')
    for arg in vars(args):
        print('%s: %s' % (arg, getattr(args, arg)))
    print('------------')

    with open('models/%s/args.txt' % args.name, 'w') as f:
        for arg in vars(args):
            print('%s: %s' % (arg, getattr(args, arg)), file=f)

    joblib.dump(args, 'models/%s/args.pkl' % args.name)

    # define loss function (criterion)
    if args.loss == 'BCEWithLogitsLoss':
        criterion = nn.BCEWithLogitsLoss().cuda()
    else:
        criterion = losses.__dict__[args.loss]().cuda()

    cudnn.benchmark = True

    # Data loading code
    img_paths = glob('input/' + args.dataset + '/images/*')
    mask_paths = glob('input/' + args.dataset + '/masks/*')

    train_img_paths, val_img_paths, train_mask_paths, val_mask_paths = \
        train_test_split(img_paths, mask_paths, test_size=0.2, random_state=41)

    # create model
    print("=> creating model %s" % args.arch)
    model = archs.__dict__[args.arch](args)

    model = model.cuda()
    # print(type(model))
    ######## model visualization in tensorboard ##############
    dummy_input = torch.rand(1, 3, 256, 256).cuda()
    writer.add_graph(model, (dummy_input, ))

    print(count_params(model))

    if args.optimizer == 'Adam':
        optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                      model.parameters()),
                               lr=args.lr)
    elif args.optimizer == 'SGD':
        optimizer = optim.SGD(filter(lambda p: p.requires_grad,
                                     model.parameters()),
                              lr=args.lr,
                              momentum=args.momentum,
                              weight_decay=args.weight_decay,
                              nesterov=args.nesterov)

    train_dataset = Dataset(args, train_img_paths, train_mask_paths, args.aug)
    val_dataset = Dataset(args, val_img_paths, val_mask_paths)

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               pin_memory=True,
                                               drop_last=True)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             pin_memory=True,
                                             drop_last=False)

    log = pd.DataFrame(
        index=[],
        columns=['epoch', 'lr', 'loss', 'iou', 'val_loss', 'val_iou'])

    best_iou = 0
    trigger = 0
    for epoch in range(args.epochs):
        print('Epoch [%d/%d]' % (epoch, args.epochs))

        # changing lr
        adjust_learning_rate(args, optimizer, epoch, writer)
        # optimizer

        # train for one epoch
        train_log = train(args, train_loader, model, criterion, optimizer,
                          epoch)
        # evaluate on validation set
        val_log = validate(args, val_loader, model, criterion)

        print('loss %.4f - iou %.4f - val_loss %.4f - val_iou %.4f' %
              (train_log['loss'], train_log['iou'], val_log['loss'],
               val_log['iou']))

        # vis in tensorboard
        writer.add_scalar('train_loss', train_log['loss'], epoch)
        writer.add_scalar('train_iou', train_log['iou'], epoch)
        writer.add_scalar('val_loss', val_log['loss'], epoch)
        writer.add_scalar('val_iou', val_log['iou'], epoch)

        tmp = pd.Series(
            [
                epoch,
                args.lr,
                train_log['loss'],
                train_log['iou'],
                val_log['loss'],
                val_log['iou'],
            ],
            index=['epoch', 'lr', 'loss', 'iou', 'val_loss', 'val_iou'])

        log = log.append(tmp, ignore_index=True)
        log.to_csv('models/%s/log.csv' % args.name, index=False)

        trigger += 1

        if val_log['iou'] > best_iou:
            torch.save(model.state_dict(), 'models/%s/model.pth' % args.name)
            best_iou = val_log['iou']
            print("=> saved best model")
            trigger = 0

        #todo: save model with best dice

        # early stopping
        if not args.early_stop is None:
            if trigger >= args.early_stop:
                print("=> early stopping")
                break

        torch.cuda.empty_cache()
Exemplo n.º 15
0
CHAR_EMBEDDING_MATRIX = model.add_lookup_parameters(
    (num_chars, char_embedding_size))

word_lstm = dy.LSTMBuilder(num_layers, embedding_size, lstm_out, model)
char_fw_lstm = dy.LSTMBuilder(1, char_embedding_size, char_lstm_out, model)
char_bw_lstm = dy.LSTMBuilder(1, char_embedding_size, char_lstm_out, model)

softmax_w = model.add_parameters((num_words, lstm_out))
softmax_b = model.add_parameters((num_words))

params = [CHAR_EMBEDDING_MATRIX, softmax_w, softmax_b]
params.extend(*word_lstm.get_parameters())
params.extend(*char_fw_lstm.get_parameters())
params.extend(*char_bw_lstm.get_parameters())
print("Number of Params: {}".format(count_params(params)))


def word_rep(word, fw_init, bw_init):
    pad = char_vocab.get('<*>')
    indices = [pad] + [char_vocab.get(c) for c in word] + [pad]
    embedded = [CHAR_EMBEDDING_MATRIX[i] for i in indices]
    forward = fw_init.transduce(embedded)
    backward = bw_init.transduce(embedded)
    return dy.concatenate([forward[-1], backward[-1]])
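
# A hypothetical usage sketch (not part of the original snippet): word_rep expects
# fresh initial states of the two character LSTMs (created after dy.renew_cg()) and
# returns the concatenation of the last forward and last backward hidden states,
# i.e. an expression of size 2 * char_lstm_out.
def embed_sentence_sketch(words):
    fw_init = char_fw_lstm.initial_state()
    bw_init = char_bw_lstm.initial_state()
    return [word_rep(w, fw_init, bw_init) for w in words]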


def calc_lm_loss(words):
    dy.renew_cg()

    sm_w = dy.parameter(softmax_w)
Exemplo n.º 16
0
def evaluate(SMASH, which_dataset, batch_size, seed, validate, num_random,
             num_perturb, num_markov, perturb_prob, arch_SGD, fp16, parallel):

    # Random seeds
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)

    num_runs = num_random + num_perturb + num_markov
    random_sample = True
    perturb = False
    markov = False

    net = torch.load('weights/' + SMASH + '.pth')
    net.eval()

    # Backwards compatibility hack; If you're trying to understand this code,
    # ignore this line.
    if not hasattr(net, 'factors'):
        net.factors = factors(net.N)

    _, test_loader = get_data_loader(which_dataset=which_dataset,
                                     augment=False,
                                     validate=validate,
                                     batch_size=batch_size)

    # Prepare lists that hold errors
    ensemble_err, err, flops, params = [], [], [], []

    # Array to which we save configurations and errors
    save_archs = []

    # Prepare ensemble predictions
    ensemble_out = torch.zeros(len(test_loader.dataset),
                               net.fc.out_features).cuda()

    # Start the stopwatch and begin testing
    start_time = time.time()
    mode = 'training' if net.training else 'testing'
    print('Evaluating %s in %s mode...' % (SMASH, mode))
    for test in range(num_runs):

        # If we've done all our random samples, switch to random perturbation mode
        if test == num_random:
            sorted_archs = sorted(save_archs, key=lambda item: item[-1])
            print(
                'Random sampling complete with best error of %f, starting perturbation...'
                % (sorted_archs[0][-1]))
            base_arch = sorted_archs[0][:10]
            perturb = True
            random_sample = False

        # If we've done all our perturbations, switch to markov chain mode
        elif test == num_random + num_perturb:
            sorted_archs = sorted(save_archs, key=lambda item: item[-1])
            print(
                'Random perturbation complete with best error of %f, starting markov chain...'
                % (sorted_archs[0][-1]))
            base_arch = sorted_archs[0][:10]
            current_error = sorted_archs[0][-1]
            markov = True

        # Sample a random architecture, as in training
        if random_sample:
            arch = net.sample_architecture()

        # Slightly change a sampled (and, presumably, high-scoring) architecture
        elif perturb:
            arch = perturb_arch.perturb_architecture(net, deepcopy(base_arch),
                                                     perturb_prob)

        # Sample weights for the chosen architecture
        w1x1 = net.sample_weights(*arch)

        # Error counters
        e, ensemble_e = 0, 0

        # Loop over validation set
        for i, (x, y) in enumerate(test_loader):

            # Get outputs
            o = net(V(x.cuda(), volatile=True), w1x1, *arch)

            # Get predictions ensembled across multiple configurations
            ensemble_out[i * batch_size:(i + 1) * batch_size] += o.data

            # Update error
            e += o.data.max(1)[1].cpu().ne(y).sum()

            # Update ensemble error
            ensemble_e += ensemble_out[i * batch_size:(i + 1) *
                                       batch_size].max(1)[1].cpu().ne(y).sum()

        # Save ensemble error thus far
        ensemble_err.append(float(ensemble_e) / ensemble_out.size(0))

        # Save individual error thus far
        err.append(float(e) / ensemble_out.size(0))

        # While in markov mode, update the base arch if we get a better SMASH score.
        if markov and err[-1] < float(current_error):
            print(
                'Error of %f superior to error of %f, accepting new architecture...'
                % (err[-1], current_error))
            base_arch = arch
            current_error = err[-1]

        # Save relevant architectural details along with error
        save_archs.append(arch +
                          (net.N, net.N_max, net.bottleneck,
                           net.max_bottleneck, net.in_channels, 0, err[-1]))

        params.append(count_params(save_archs[-1]))
        flops.append(count_flops(save_archs[-1], which_dataset))
        print(
            'For run #%d/%d, Individual Error %2.2f Ensemble Err %2.2f, params %e, flops %e,  Time Elapsed %d.'
            % (test, num_runs, 100 * err[-1], 100 * ensemble_err[-1],
               params[-1], flops[-1], time.time() - start_time)
        )  # (leftover format fields: LogSoftmax/Softmax ensemble errors, ensemble_olgs_err[-1], ensemble_os_err[-1])

    best_acc = sorted(err)[0]
    worst_acc = sorted(err)[-1]
    least_flops = sorted(flops)[0]
    most_flops = sorted(flops)[-1]
    least_params = sorted(params)[0]
    most_params = sorted(params)[-1]
    print('Best accuracy is ' + str(best_acc) + ', Worst accuracy is ' +
          str(worst_acc))

    # Save results
    # np.savez(filename[:-4] + '_' + mode + '_errors.npz', **{'err':err, 'ensemble_err':ensemble_err})
    # save_archs = sorted(save_archs, key = lambda item: item[-1])
    np.savez(
        SMASH + '_archs.npz', **{
            'archs': sorted(save_archs, key=lambda item: item[-1]),
            'unsorted_archs': save_archs
        })
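
# A minimal sketch of the ensembling rule used above (an illustration, not code from
# this repository): each run's class scores are accumulated into a running sum and the
# ensemble prediction is the argmax of that sum, so the ensemble error reported for run
# k reflects all runs up to and including k.
import numpy as np

def ensemble_error_sketch(run_outputs, labels):
    # run_outputs: list of (num_examples, num_classes) score arrays, one per run
    summed = np.sum(run_outputs, axis=0)
    return float(np.mean(summed.argmax(axis=1) != labels))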
Exemplo n.º 17
0
    def __init__(self, model, vocab):

        assert isinstance(model, dict) or isinstance(model, str)
        assert isinstance(vocab, tuple) or isinstance(vocab, str)

        # dataset
        logger.info('-' * 100)
        logger.info('Loading test dataset')
        self.dataset = data.CodePtrDataset(mode='test')
        self.dataset_size = len(self.dataset)
        logger.info('Size of test dataset: {}'.format(self.dataset_size))

        logger.info('The dataset is successfully loaded')

        self.dataloader = DataLoader(dataset=self.dataset,
                                     batch_size=config.test_batch_size,
                                     collate_fn=lambda *args: utils.collate_fn(args,
                                                                               source_vocab=self.source_vocab,
                                                                               code_vocab=self.code_vocab,
                                                                               ast_vocab=self.ast_vocab,
                                                                               nl_vocab=self.nl_vocab,
                                                                               raw_nl=True))

        # vocab
        logger.info('-' * 100)
        if isinstance(vocab, tuple):
            logger.info('Vocabularies are passed from parameters')
            assert len(vocab) == 4
            self.source_vocab, self.code_vocab, self.ast_vocab, self.nl_vocab = vocab
        else:
            logger.info('Vocabularies are read from dir: {}'.format(vocab))
            self.source_vocab = utils.load_vocab(vocab, 'source')
            self.code_vocab = utils.load_vocab(vocab, 'code')
            self.ast_vocab = utils.load_vocab(vocab, 'ast')
            self.nl_vocab = utils.load_vocab(vocab, 'nl')

        # vocabulary
        self.source_vocab_size = len(self.source_vocab)
        self.code_vocab_size = len(self.code_vocab)
        self.ast_vocab_size = len(self.ast_vocab)
        self.nl_vocab_size = len(self.nl_vocab)

        logger.info('Size of source vocabulary: {} -> {}'.format(self.source_vocab.origin_size, self.source_vocab_size))
        logger.info('Size of code vocabulary: {} -> {}'.format(self.code_vocab.origin_size, self.code_vocab_size))
        logger.info('Size of ast vocabulary: {}'.format(self.ast_vocab_size))
        logger.info('Size of nl vocabulary: {} -> {}'.format(self.nl_vocab.origin_size, self.nl_vocab_size))

        logger.info('Vocabularies are successfully built')

        # model
        logger.info('-' * 100)
        logger.info('Building model')
        self.model = models.Model(source_vocab_size=self.source_vocab_size,
                                  code_vocab_size=self.code_vocab_size,
                                  ast_vocab_size=self.ast_vocab_size,
                                  nl_vocab_size=self.nl_vocab_size,
                                  is_eval=True,
                                  model=model)
        # model device
        logger.info('Model device: {}'.format(next(self.model.parameters()).device))
        # log model statistic
        logger.info('Trainable parameters: {}'.format(utils.human_format(utils.count_params(self.model))))
    def run(self):
        # load data
        args_dict = self._default_configs()
        args = dotdict(args_dict)
        batchedData, maxTimeSteps, totalN = self.load_data(args,
                                                           mode=mode,
                                                           type=level)
        model = model_fn(args, maxTimeSteps)

        # count the num of params
        num_params = count_params(model, mode='trainable')
        all_num_params = count_params(model, mode='all')
        model.config['trainable params'] = num_params
        model.config['all params'] = all_num_params
        print(model.config)
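        # Presumably mode='trainable' counts only the variables updated by the
        # optimizer, while mode='all' also includes non-trainable variables
        # (e.g. the global step); the exact semantics depend on the count_params
        # helper used here.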

        #with tf.Session(graph=model.graph) as sess:
        with tf.Session() as sess:
            # restore from stored model
            if keep == True:
                ckpt = tf.train.get_checkpoint_state(savedir)
                if ckpt and ckpt.model_checkpoint_path:
                    model.saver.restore(sess, ckpt.model_checkpoint_path)
                    print('Model restored from:' + savedir)
            else:
                print('Initializing')
                sess.run(model.initial_op)

            for epoch in range(num_epochs):
                ## training
                start = time.time()
                if mode == 'train':
                    print('Epoch', epoch + 1, '...')
                batchErrors = np.zeros(len(batchedData))
                batchRandIxs = np.random.permutation(len(batchedData))

                for batch, batchOrigI in enumerate(batchRandIxs):
                    batchInputs, batchTargetSparse, batchSeqLengths = batchedData[
                        batchOrigI]
                    batchTargetIxs, batchTargetVals, batchTargetShape = batchTargetSparse
                    feedDict = {
                        model.inputX: batchInputs,
                        model.targetIxs: batchTargetIxs,
                        model.targetVals: batchTargetVals,
                        model.targetShape: batchTargetShape,
                        model.seqLengths: batchSeqLengths
                    }

                    if level == 'cha':
                        if mode == 'train':
                            _, l, pre, y, er = sess.run([
                                model.optimizer, model.loss, model.predictions,
                                model.targetY, model.errorRate
                            ],
                                                        feed_dict=feedDict)

                            batchErrors[batch] = er
                            print(
                                '\n{} mode, total:{},batch:{}/{},epoch:{}/{},train loss={:.3f},mean train CER={:.3f}\n'
                                .format(level, totalN, batch + 1,
                                        len(batchRandIxs), epoch + 1,
                                        num_epochs, l, er / batch_size))

                        elif mode == 'test':
                            l, pre, y, er = sess.run([
                                model.loss, model.predictions, model.targetY,
                                model.errorRate
                            ],
                                                     feed_dict=feedDict)
                            batchErrors[batch] = er
                            print(
                                '\n{} mode, total:{},batch:{}/{},test loss={:.3f},mean test CER={:.3f}\n'
                                .format(level, totalN, batch + 1,
                                        len(batchRandIxs), l, er / batch_size))

                    elif level == 'phn':
                        if mode == 'train':
                            _, l, pre, y = sess.run([
                                model.optimizer, model.loss, model.predictions,
                                model.targetY
                            ],
                                                    feed_dict=feedDict)

                            er = get_edit_distance([pre.values], [y.values],
                                                   True, level)
                            print(
                                '\n{} mode, total:{},batch:{}/{},epoch:{}/{},train loss={:.3f},mean train PER={:.3f}\n'
                                .format(level, totalN, batch + 1,
                                        len(batchRandIxs), epoch + 1,
                                        num_epochs, l, er))
                            batchErrors[batch] = er * len(batchSeqLengths)
                        elif mode == 'test':
                            l, pre, y = sess.run(
                                [model.loss, model.predictions, model.targetY],
                                feed_dict=feedDict)
                            er = get_edit_distance([pre.values], [y.values],
                                                   True, level)
                            print(
                                '\n{} mode, total:{},batch:{}/{},test loss={:.3f},mean test PER={:.3f}\n'
                                .format(level, totalN, batch + 1,
                                        len(batchRandIxs), l, er))
                            batchErrors[batch] = er * len(batchSeqLengths)

                    # NOTE:
                    if er / batch_size == 1.0:
                        break

                    if batch % 30 == 0:
                        print('Truth:\n' + output_to_sequence(y, type=level))
                        print('Output:\n' +
                              output_to_sequence(pre, type=level))

                    if mode == 'train' and (
                        (epoch * len(batchRandIxs) + batch + 1) % 20 == 0 or
                        (epoch == num_epochs - 1
                         and batch == len(batchRandIxs) - 1)):
                        checkpoint_path = os.path.join(savedir, 'model.ckpt')
                        model.saver.save(sess,
                                         checkpoint_path,
                                         global_step=epoch)
                        print('Model has been saved in {}'.format(savedir))
                end = time.time()
                delta_time = end - start
                print('Epoch ' + str(epoch + 1) + ' needs time:' +
                      str(delta_time) + ' s')

                if mode == 'train':
                    if (epoch + 1) % 1 == 0:
                        checkpoint_path = os.path.join(savedir, 'model.ckpt')
                        model.saver.save(sess,
                                         checkpoint_path,
                                         global_step=epoch)
                        print('Model has been saved in {}'.format(savedir))
                    epochER = batchErrors.sum() / totalN
                    print('Epoch', epoch + 1, 'mean train error rate:',
                          epochER)
                    logging(model,
                            logfile,
                            epochER,
                            epoch,
                            delta_time,
                            mode='config')
                    logging(model,
                            logfile,
                            epochER,
                            epoch,
                            delta_time,
                            mode=mode)

                if mode == 'test':
                    with open(os.path.join(resultdir, level + '_result.txt'),
                              'a') as result:
                        result.write(output_to_sequence(y, type=level) + '\n')
                        result.write(
                            output_to_sequence(pre, type=level) + '\n')
                        result.write('\n')
                    epochER = batchErrors.sum() / totalN
                    print(' test error rate:', epochER)
                    logging(model, logfile, epochER, mode=mode)
Exemplo n.º 19
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--bert_model_or_config_file",
        default=None,
        type=str,
        required=True,
        help=(
            "Path of configuration file (if starting from scratch) or directory"
            " containing checkpoint (if resuming) or directory containig a"
            " pretrained model and tokenizer (if re-training)."))

    # Use for resuming from checkpoint
    parser.add_argument("--resume",
                        action='store_true',
                        help="Resume from checkpoint")

    # Required if not resuming
    parser.add_argument(
        "--dir_train_data",
        type=str,
        help=
        "Path of a directory containing training files (names must all match <lang>.train)"
    )
    parser.add_argument(
        "--path_vocab",
        type=str,
        help=
        "Path of a 2-column TSV file containing the vocab of chars and their frequency."
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--mlm_only",
        action="store_true",
        help=
        ("Use only masked language modeling, no sentence pair classification "
         " (e.g. if you only have unk.train in your training directory)"))
    parser.add_argument(
        "--avgpool_for_spc",
        action="store_true",
        help=
        ("Use average pooling of all last hidden states, rather than just the last hidden state of CLS, to do SPC. "
         "Note that in either case, the pooled vector passes through a square linear layer and a tanh before the classification layer."
         ))
    parser.add_argument(
        "--sampling_alpha",
        type=float,
        default=1.0,
        help=
        "Dampening factor for relative frequencies used to compute language sampling probabilities"
    )
    parser.add_argument(
        "--weight_relevant",
        type=float,
        default=1.0,
        help=
        "Relative sampling frequency of relevant languages wrt irrelevant languages"
    )
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument(
        "--seq_len",
        default=128,
        type=int,
        help=
        "Length of input sequences. Shorter seqs are padded, longer ones are trucated"
    )
    parser.add_argument(
        "--min_freq",
        default=1,
        type=int,
        help=
        "Minimum character frequency. Characters whose frequency is under this threshold will be mapped to <UNK>"
    )
    parser.add_argument("--learning_rate",
                        default=1e-4,
                        type=float,
                        help="The initial learning rate for AdamW optimizer.")
    parser.add_argument("--equal_betas",
                        action='store_true',
                        help="Use beta1=beta2=0.9 for AdamW optimizer.")
    parser.add_argument(
        "--correct_bias",
        action='store_true',
        help=
        "Correct bias in AdamW optimizer (correct_bias=False is meant to reproduce BERT behaviour exactly."
    )
    parser.add_argument(
        "--max_train_steps",
        default=1000000,
        type=int,
        help=
        "Maximum number of training steps to perform. Note: # optimization steps = # train steps / # accumulation steps."
    )
    parser.add_argument(
        "--num_train_steps_per_epoch",
        default=1000,
        type=int,
        help=
        "Number of training steps that equals one epoch. Note: # optimization steps = # train steps / # accumulation steps."
    )
    parser.add_argument(
        "--num_warmup_steps",
        default=10000,
        type=int,
        help=
        "Number of optimization steps (i.e. training steps / accumulation steps) to perform linear learning rate warmup for. "
    )
    parser.add_argument(
        '--grad_accum_steps',
        type=int,
        default=1,
        help=
        "Number of training steps (i.e. batches) to accumualte before performing a backward/update pass."
    )
    parser.add_argument(
        "--num_gpus",
        type=int,
        default=-1,
        help="Num GPUs to use for training (0 for none, -1 for all available)")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    args = parser.parse_args()

    # These args are required if we are not resuming from checkpoint
    if not args.resume:
        assert args.dir_train_data is not None
        assert args.path_vocab is not None
        assert args.output_dir is not None

    # Check whether we are starting from scratch, resuming from a checkpoint, or retraining a pretrained model
    from_scratch = (not args.resume) and (not os.path.isdir(
        args.bert_model_or_config_file))
    retraining = (not args.resume) and (not from_scratch)

    # Load config. Load or create checkpoint data.
    if from_scratch:
        logger.info("***** Starting pretraining job from scratch *******")
        config = BertConfig.from_json_file(args.bert_model_or_config_file)
        checkpoint_data = {}
    elif retraining:
        logger.info(
            "***** Starting pretraining job from pre-trained model *******")
        logger.info("Loading pretrained model...")
        model = BertModelForMaskedLM.from_pretrained(
            args.bert_model_or_config_file)
        config = model.config
        checkpoint_data = {}
    elif args.resume:
        logger.info("***** Resuming pretraining job *******")
        logger.info("Loading checkpoint...")
        checkpoint_path = os.path.join(args.bert_model_or_config_file,
                                       "checkpoint.tar")
        checkpoint_data = torch.load(checkpoint_path)
        # Make sure we haven't already done the maximum number of optimization steps
        if checkpoint_data["global_step"] >= checkpoint_data["max_opt_steps"]:
            msg = "We have already done %d optimization steps." % checkpoint_data[
                "global_step"]
            raise RuntimeError(msg)
        logger.info("Resuming from global step %d" %
                    checkpoint_data["global_step"])
        # Replace args with initial args for this job, except for num_gpus, seed and model directory
        current_num_gpus = args.num_gpus
        current_seed = args.seed
        checkpoint_dir = args.bert_model_or_config_file
        args = deepcopy(checkpoint_data["initial_args"])
        args.num_gpus = current_num_gpus
        args.seed = current_seed
        args.bert_model_or_config_file = checkpoint_dir
        args.resume = True
        logger.info("Args (most have been reloaded from checkpoint): %s" %
                    args)
        # Load config
        config_path = os.path.join(args.bert_model_or_config_file,
                                   "config.json")
        config = BertConfig.from_json_file(config_path)

    # Check args
    assert args.sampling_alpha >= 0 and args.sampling_alpha <= 1
    assert args.weight_relevant > 0
    if args.grad_accum_steps < 1:
        raise ValueError(
            "Invalid grad_accum_steps parameter: {}, should be >= 1".format(
                args.grad_accum_steps))
    train_paths = glob.glob(os.path.join(args.dir_train_data, "*.train"))
    assert len(train_paths) > 0
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    if (not args.resume) and len(os.listdir(args.output_dir)) > 0:
        msg = "Directory %s is not empty" % args.output_dir
        raise ValueError(msg)

    # Make or load tokenizer
    if args.resume or retraining:
        logger.info("Loading tokenizer...")
        tokenizer_path = os.path.join(args.bert_model_or_config_file,
                                      "tokenizer.pkl")
        with open(tokenizer_path, "rb") as f:
            tokenizer = pickle.load(f)
    elif from_scratch:
        logger.info("Making tokenizer...")

        assert os.path.exists(args.path_vocab)
        tokenizer = CharTokenizer(args.path_vocab)
        if args.min_freq > 1:
            tokenizer.trim_vocab(args.min_freq)
        # Adapt vocab size in config
        config.vocab_size = len(tokenizer.vocab)

        # Save tokenizer
        fn = os.path.join(args.output_dir, "tokenizer.pkl")
        with open(fn, "wb") as f:
            pickle.dump(tokenizer, f)
    logger.info("Size of vocab: {}".format(len(tokenizer.vocab)))

    # Copy config in output directory
    if not args.resume:
        config_path = os.path.join(args.output_dir, "config.json")
        config.to_json_file(config_path)

    # What GPUs do we use?
    if args.num_gpus == -1:
        args.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        args.n_gpu = torch.cuda.device_count()
        device_ids = None
    else:
        args.device = torch.device("cuda" if torch.cuda.is_available()
                                   and args.num_gpus > 0 else "cpu")
        args.n_gpu = args.num_gpus
        if args.n_gpu > 1:
            device_ids = list(range(args.n_gpu))
    if args.local_rank != -1:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        args.n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}".format(
        args.device, args.n_gpu, bool(args.local_rank != -1)))

    # Seed RNGs
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # Prepare model
    if from_scratch or args.resume:
        model = BertForMaskedLM(config)
        if args.resume:
            model.load_state_dict(checkpoint_data["model_state_dict"])
    model.to(args.device)

    # Prepare pooler (if we are doing SPC)
    if args.mlm_only:
        pooler = None
    else:
        pooler = Pooler(model.config.hidden_size,
                        cls_only=(not args.avgpool_for_spc))
        if args.resume:
            pooler.load_state_dict(checkpoint_data["pooler_state_dict"])
        pooler.to(args.device)

    # Distributed or parallel?
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed training."
            )
        model = DDP(model)
        if not args.mlm_only:
            pooler = DDP(pooler)
    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model, device_ids=device_ids)
        pooler = torch.nn.DataParallel(pooler, device_ids=device_ids)

    # Log some info on the model
    logger.info("Model config: %s" % repr(model.config))
    logger.info("Nb params: %d" % count_params(model))
    if not args.mlm_only:
        logger.info("Nb params in pooler: %d" % count_params(pooler))

    # Check if there is unk training data.
    path_unk = check_for_unk_train_data(train_paths)

    # Get training data
    max_seq_length = args.seq_len + 2  # We add 2 for CLS and SEP
    logger.info("Preparing dataset using data from %s" % args.dir_train_data)
    if args.mlm_only:
        # We only want to do MLM
        train_dataset_spc = None
        train_dataset_mlm = BertDatasetForMLM(
            train_paths,
            tokenizer,
            max_seq_length,
            sampling_alpha=args.sampling_alpha,
            weight_relevant=args.weight_relevant,
            encoding="utf-8",
            seed=args.seed)

    else:
        # We want to do SPC and MLM. If unk data is present, we remove
        # it from the paths provided to BertLabeledDataset.
        if path_unk is not None:
            train_paths.remove(path_unk)
        train_dataset_spc = BertDatasetForSPCAndMLM(
            train_paths,
            tokenizer,
            max_seq_length,
            sampling_alpha=args.sampling_alpha,
            weight_relevant=args.weight_relevant,
            encoding="utf-8",
            seed=args.seed)
        if path_unk is None:
            train_dataset_mlm = None
        else:
            # In this case we use a BertDatasetForMLM for the unk
            # data. Both datasets will be of the same size. The latter
            # is used for MLM only.
            train_dataset_mlm = BertDatasetForMLM(
                [path_unk],
                tokenizer,
                max_seq_length,
                sampling_alpha=args.sampling_alpha,
                weight_relevant=args.weight_relevant,
                encoding="utf-8",
                seed=args.seed)
            assert len(train_dataset_spc) == len(train_dataset_mlm)

    # Check length of dataset
    dataset_length = len(
        train_dataset_spc) if train_dataset_spc is not None else len(
            train_dataset_mlm)

    # Store optimization steps performed and maximum number of optimization steps
    if not args.resume:
        checkpoint_data["global_step"] = 0
        checkpoint_data[
            "max_opt_steps"] = args.max_train_steps // args.grad_accum_steps

    # Compute number of optimization steps per epoch
    num_opt_steps_per_epoch = args.num_train_steps_per_epoch // args.grad_accum_steps

    # Compute number of epochs necessary to reach the maximum number of optimization steps
    opt_steps_left = checkpoint_data["max_opt_steps"] - checkpoint_data[
        "global_step"]
    args.num_epochs = math.ceil(opt_steps_left / num_opt_steps_per_epoch)
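    # Worked example with illustrative numbers: max_train_steps=1000000,
    # grad_accum_steps=4 and num_train_steps_per_epoch=1000 give
    # max_opt_steps=250000 and num_opt_steps_per_epoch=250, so a fresh run
    # schedules ceil(250000 / 250) = 1000 epochs.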

    # Log some info before training
    logger.info("*** Training info: ***")
    logger.info("Max training steps: %d" % args.max_train_steps)
    logger.info("Gradient accumulation steps: %d" % args.grad_accum_steps)
    logger.info("Max optimization steps: %d" %
                checkpoint_data["max_opt_steps"])
    if args.resume:
        logger.info("Nb optimization steps done so far: %d" %
                    checkpoint_data["global_step"])
    logger.info("Total dataset size: %d examples" % (dataset_length))
    logger.info("Batch size: %d" % args.train_batch_size)
    logger.info("# training steps/epoch: %d" %
                (args.num_train_steps_per_epoch))
    logger.info("# optimization steps/epoch: %d" % (num_opt_steps_per_epoch))
    logger.info("# epochs to do: %d" % (args.num_epochs))

    # Prepare optimizer
    logger.info("Preparing optimizer...")
    np_list = list(model.named_parameters())
    if not args.mlm_only:
        np_list += list(pooler.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    opt_params = [{
        'params':
        [p for n, p in np_list if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params': [p for n, p in np_list if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    if args.equal_betas:
        betas = (0.9, 0.9)
    else:
        betas = (0.9, 0.999)
    optimizer = AdamW(
        opt_params,
        lr=args.learning_rate,
        betas=betas,
        correct_bias=args.correct_bias
    )  # To reproduce BertAdam specific behaviour, use correct_bias=False
    if args.resume:
        optimizer.load_state_dict(checkpoint_data["optimizer_state_dict"])

    # Prepare scheduler
    logger.info("Preparing learning rate scheduler...")
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=checkpoint_data["max_opt_steps"])
    if args.resume:
        scheduler.load_state_dict(checkpoint_data["scheduler_state_dict"])
        logger.info("Current learning rate: %f" % scheduler.get_last_lr()[0])

    # Save initial training args
    if not args.resume:
        checkpoint_data["initial_args"] = args

    # Prepare training log
    time_str = datetime.now().strftime("%Y%m%d%H%M%S")
    train_log_path = os.path.join(args.output_dir, "%s.train.log" % time_str)
    args.train_log_path = train_log_path

    # Train
    if args.mlm_only:
        train(model,
              None,
              tokenizer,
              optimizer,
              scheduler,
              train_dataset_mlm,
              args,
              checkpoint_data,
              extra_mlm_dataset=None)
    else:
        train(model,
              pooler,
              tokenizer,
              optimizer,
              scheduler,
              train_dataset_spc,
              args,
              checkpoint_data,
              extra_mlm_dataset=train_dataset_mlm)
Exemplo n.º 20
0
        run_tstart = time.time()
        for epoch in range(args.epochs):
            train(model, optimizer, epoch)

        run_time = time.time() - run_tstart
        run_loss, run_acc = test(model, optimizer)

        if args.runs > 1:
            print("Run #{run} Test -- time: {time}s acc: {acc:.2f}%".format(
                run=run, time=run_time, acc=100 * run_acc),
                  flush=True)
    except KeyboardInterrupt:
        args.runs = run
        break

    total_loss += run_loss
    total_acc += run_acc
    total_time += run_time

total_loss, total_acc, total_time = map(lambda x: x / args.runs,
                                        [total_loss, total_acc, total_time])

print("Optimization on dataset \"{dataset}\" Finished!".format(
    dataset=args.dataset))
print("#Parameters: {param_count}".format(param_count=count_params(model)))
print("Average time elapsed: {:.4f}s".format(total_time))

# Testing
print("Test set results:", "avg loss= {:.4f}".format(total_loss),
      "avg accuracy= {:.4f}".format(total_acc))
Exemplo n.º 21
0
def train():

    tf.global_variables_initializer().run()

    could_load, checkpoint_counter = load()
    if could_load:
        start_epoch = checkpoint_counter // num_batches
        start_batch_id = checkpoint_counter - start_epoch * num_batches
        counter = checkpoint_counter
        print("Checkpoint loaded successfully")

    else:
        start_epoch = 0
        start_batch_id = 0
        counter = 1
        print("train from scratch...")

    train_iter=[]
    train_loss=[]

    utils.count_params()
    print("Total image:{}".format(len(train_img)))
    print("Total epoch:{}".format(args.num_epochs))
    print("Batch size:{}".format(args.batch_size))
    print("Learning rate:{}".format(args.learning_rate))
    print("Checkpoint step:{}".format(args.checkpoint_step))

    print("Data Argument:")
    print("h_flip: {}".format(args.h_flip))
    print("v_flip: {}".format(args.v_flip))
    print("rotate: {}".format(args.rotation))
    print("clip size: {}".format(args.clip_size))


    for i in range(start_epoch,args.num_epochs):
        id_list = np.random.permutation(len(train_img))

        epoch_time=time.time()
        for j in range(start_batch_id,num_batches):
            img_d=[]
            lab_d=[]
            for ind in range(args.batch_size):
                id = id_list[j * args.batch_size + ind]
                img_d.append(train_img[id])
                lab_d.append(train_label[id])

            x_batch, y_batch = load_batch(img_d,lab_d,args)
            feed_dict = {img: x_batch,
                         label: y_batch

                         }
            loss_tmp = []
            _, loss, pred1 = sess.run([train_step, sigmoid_cross_entropy_loss, pred], feed_dict=feed_dict)
            loss_tmp.append(loss)
            if (counter % 100 == 0):
                tmp = np.mean(loss_tmp)
                train_iter.append(counter)
                train_loss.append(tmp)
                print('Epoch', i, '|Iter', counter, '|Loss', tmp)
            counter += 1
        start_batch_id=0
        print('Time:', time.time() - epoch_time)

        #if((i+1)%10==0 ):  # decay lr by a factor of 0.1 every 10 epochs, starting at epoch 10
            #learning_rate = 0.1 * learning_rate
        #last_checkpoint_name = "checkpoint/latest_model_epoch_" + "_pspet.ckpt"
        # print("Saving latest checkpoint")
        # saver.save(sess, last_checkpoint_name)


        if((i+1)%args.checkpoint_step==0):  # save a checkpoint every checkpoint_step epochs (e.g. 20, 30, 40, 50)
            args.learning_rate=0.1*args.learning_rate
            print(args.learning_rate)

            saver.save(sess,'./checkpoint/model.ckpt',global_step=counter,write_meta_graph=True)


            """
            host = host_subplot(111)
            plt.subplots_adjust(right=0.8)
            p1, = host.plot(train_iter, train_loss, label="training loss")
            host.legend(loc=5)
            host.axis["left"].label.set_color(p1.get_color())
            host.set_xlim([0, counter])
            plt.draw()
            plt.show()
            """
            fig1, ax1 = plt.subplots(figsize=(11, 8))
            ax1.plot(train_iter, train_loss)
            ax1.set_title("Training loss vs Iter")
            ax1.set_xlabel("Iter")
            ax1.set_ylabel("Training loss")
            plt.savefig('Training loss_vs_Iter.png')
            plt.clf()

        remain_time=(args.num_epochs - 1 - i) * (time.time() - epoch_time)
        m, s = divmod(remain_time, 60)
        h, m = divmod(m, 60)
        print("Remaining training time = %d hours %d minutes %d seconds\n" % (h, m, s))
    def num_params(self):
        return count_params(self)
def main():
    args = parse_args()

    if args.name is None:
        if args.deepsupervision:
            args.name = '%s_%s_wDS' % (args.dataset, args.arch)
        else:
            args.name = '%s_%s_woDS' % (args.dataset, args.arch)
    if not os.path.exists('models/%s' % args.name):
        os.makedirs('models/%s' % args.name)

    print('Config -----')
    for arg in vars(args):
        print('%s: %s' % (arg, getattr(args, arg)))
    print('------------')

    with open('models/%s/args.txt' % args.name, 'w') as f:
        for arg in vars(args):
            print('%s: %s' % (arg, getattr(args, arg)), file=f)

    joblib.dump(args, 'models/%s/args.pkl' % args.name)

    # define loss function (criterion)
    if args.loss == 'BCEWithLogitsLoss':
        criterion = nn.BCEWithLogitsLoss().cuda()
    else:
        criterion = losses.__dict__[args.loss]().cuda()

    cudnn.benchmark = True

    DATA_PATH = '../../Datasets/'

    img_paths = []
    mask_paths = []
    for class_folder in os.listdir(DATA_PATH):
        FOLDER_PATH = os.path.join(DATA_PATH, class_folder)
        for patient_folder in os.listdir(FOLDER_PATH):
            patient_folder = os.path.join(FOLDER_PATH, patient_folder)
            if os.path.isdir(patient_folder):
                if (os.path.isfile(
                        os.path.join(patient_folder, 'AP/Ap_Pedicle.png'))):
                    mask_paths.append(
                        os.path.join(patient_folder, 'AP/Ap_Pedicle.png'))
                    img_paths.append(os.path.join(patient_folder, "AP.jpg"))

    c = list(zip(img_paths, mask_paths))

    random.shuffle(c)

    img_paths, mask_paths = zip(*c)
    img_paths = np.array(img_paths)
    mask_paths = np.array(mask_paths)

    train_img_paths, val_img_paths, train_mask_paths, val_mask_paths = \
        train_test_split(img_paths, mask_paths, test_size=0.05, random_state=41)

    # create model
    print("=> creating model %s" % args.arch)
    model = archs.__dict__[args.arch](args)

    model = model.cuda()

    print(count_params(model))

    if args.optimizer == 'Adam':
        optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                      model.parameters()),
                               lr=args.lr)
    elif args.optimizer == 'SGD':
        optimizer = optim.SGD(filter(lambda p: p.requires_grad,
                                     model.parameters()),
                              lr=args.lr,
                              momentum=args.momentum,
                              weight_decay=args.weight_decay,
                              nesterov=args.nesterov)

    train_dataset = Dataset(args, train_img_paths, train_mask_paths, args.aug)
    val_dataset = Dataset(args, val_img_paths, val_mask_paths)

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               pin_memory=True,
                                               drop_last=True)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             pin_memory=True,
                                             drop_last=False)

    log = pd.DataFrame(
        index=[],
        columns=['epoch', 'lr', 'loss', 'iou', 'val_loss', 'val_iou'])

    best_iou = 0
    trigger = 0
    for epoch in range(args.epochs):
        print('Epoch [%d/%d]' % (epoch, args.epochs))

        # train for one epoch
        train_log = train(args, train_loader, model, criterion, optimizer,
                          epoch)
        # evaluate on validation set
        val_log = validate(args, val_loader, model, criterion)

        print('loss %.4f - iou %.4f - val_loss %.4f - val_iou %.4f' %
              (train_log['loss'], train_log['iou'], val_log['loss'],
               val_log['iou']))

        tmp = pd.Series(
            [
                epoch,
                args.lr,
                train_log['loss'],
                train_log['iou'],
                val_log['loss'],
                val_log['iou'],
            ],
            index=['epoch', 'lr', 'loss', 'iou', 'val_loss', 'val_iou'])

        log = log.append(tmp, ignore_index=True)
        log.to_csv('models/%s/log.csv' % args.name, index=False)

        trigger += 1

        if val_log['iou'] > best_iou:
            torch.save(model.state_dict(), './models/%s/model.pth' % args.name)
            best_iou = val_log['iou']
            print("=> saved best model")
            trigger = 0

        # early stopping
        if not args.early_stop is None:
            if trigger >= args.early_stop:
                print("=> early stopping")
                break

        torch.cuda.empty_cache()
Exemplo n.º 24
0
import data

from deepsense import neptune

ctx = neptune.Context()

model_name = ctx.params['model']
epochs = ctx.params['epochs']
learning_rate = ctx.params['learning_rate']

ctx.tags.append(model_name)

# data
dataloaders = data.get_dataloaders('/input', batch_size=128)

# network
model = models.MODELS[model_name]
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss(size_average=False)

print("Network created. Number of parameters:")
print(utils.count_params(model))

# training
trained_model = utils.train_model(model,
                                  criterion,
                                  optimizer,
                                  dataloaders,
                                  num_epochs=epochs)
utils.save_all(trained_model)
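
# A minimal sketch of what a parameter-counting helper such as utils.count_params
# could look like for a PyTorch module; the repository's own implementation may
# differ (for example by counting only parameters with requires_grad=True).
def count_params_sketch(model):
    # total number of elements across all parameters of the module
    return sum(p.numel() for p in model.parameters())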
Exemplo n.º 25
0
def _spec(net, xentPerExample, is_accum=False, nohess=False, randvec=False):
  """returns principal eig of the hessian"""

  if nohess:
    net.valtotEager = net.bzEager = net.valEager = net.valtotAccum = net.bzAccum = net.valAccum = tf.constant(0, tf.float32)
    net.projvec = net.projvec_op = net.projvec_corr = tf.constant(0, tf.float32)
    return

  batchsize = tf.shape(xentPerExample)[0]
  xent = tf.reduce_sum(xentPerExample)

  # decide weights from which to compute the spectral radius
  print('Number of trainable weights: ' + str(utils.count_params(tf.trainable_variables())))
  if not net.args.specreg_bn: # don't include batch norm weights
    net.regularizable = []
    for var in tf.trainable_variables():
      if var.op.name.find('logit/dense/kernel') > -1 or var.op.name.find(r'DW') > -1:
        net.regularizable.append(var)
    print('Number of regularizable weights: ' + str(utils.count_params(net.regularizable)))
  else:
    net.regularizable = tf.trainable_variables() # do include bn weights
    print('Still zeroing out bias and bn variables in hessian calculation in utils.filtnorm function')

  # create initial projection vector (randomly and normalized)
  projvec_init = [np.random.randn(*r.get_shape().as_list()) for r in net.regularizable]
  magnitude = np.sqrt(np.sum([np.sum(p**2) for p in projvec_init]))
  projvec_init = [p/magnitude for p in projvec_init]

  # projection vector tensor variable
  net.count = net.count + 1 if hasattr(net, 'count') else 0
  with tf.variable_scope('projvec/'+str(net.count)):
    net.projvec = [tf.get_variable(name=r.op.name, dtype=tf.float32, shape=r.get_shape(),
                                   trainable=False, initializer=tf.constant_initializer(p))
                   for r,p in zip(net.regularizable, projvec_init)]

  # compute filter normalization
  print('normalization scheme: '+net.args.normalizer)
  if net.args.normalizer == None or net.args.normalizer=='None':
    projvec_mul_normvalues = net.projvec
  else:
    if net.args.normalizer == 'filtnorm': normalizer = utils.filtnorm
    elif net.args.normalizer == 'layernorm': normalizer = utils.layernorm
    elif net.args.normalizer == 'layernormdev': normalizer = utils.layernormdev
    net.normvalues = normalizer(net.regularizable)
    projvec_mul_normvalues = [n*p for n,p in zip(net.normvalues, net.projvec)]

  # get gradient of loss wrt inputs
  tstart = time.time(); gradLoss = tf.gradients(xent, net.regularizable); print('Built gradLoss: ' + str(time.time() - tstart) + ' s')

  # get hessian vector product
  tstart = time.time()
  hessVecProd = tf.gradients(gradLoss, net.regularizable, projvec_mul_normvalues)
  # hessVecProd = [h*n for h,n in zip(hessVecProd, net.normvalues)]
  print('Built hessVecProd: ' + str(time.time() - tstart) + ' s')

  # build graph for full-batch hessian calculations which require accum ops and storage variables (for validation)
  if is_accum:

    # create op to accumulate gradients
    with tf.variable_scope('accum'):
      hessvecprodAccum = [tf.Variable(tf.zeros_like(h), trainable=False, name=h.op.name) for h in hessVecProd]
      batchsizeAccum = tf.Variable(0, trainable=False, name='batchsizeAccum')
      net.zero_op = [a.assign(tf.zeros_like(a)) for a in hessvecprodAccum] + [batchsizeAccum.assign(0)]
      net.accum_op = [a.assign_add(g) for a,g in zip(hessvecprodAccum, hessVecProd)] + [batchsizeAccum.assign_add(batchsize)]

    # compute the projected projection vector using accumulated hvps
    nextProjvec = compute_nextProjvec(net.projvec, hessvecprodAccum, net.projvec_beta, randvec=randvec)
    print('nextProjvec using accumed hvp')

    # hooks for total eigenvalue, batch size, and eigenvalue
    net.valtotAccum = utils.list2dotprod(net.projvec, hessvecprodAccum)
    net.bzAccum = tf.to_float(batchsizeAccum)
    net.valAccum = net.valtotAccum / net.bzAccum

  # build graph for on-the-fly per-batch hessian calculations (for training)
  else:

    # compute the projected projection vector using instantaneous hvp
    nextProjvec = compute_nextProjvec(net.projvec, hessVecProd, net.projvec_beta, randvec=randvec)
    print('nextProjvec using instant hvp and randvec is', randvec)

    # hooks for total eigenvalue, batch size, and eigenvalue
    net.valtotEager = utils.list2dotprod(net.projvec, hessVecProd)
    net.bzEager = tf.to_float(batchsize)
    net.valEager = net.valtotEager / net.bzEager

  # dot product of the new projection vector with the previous one (convergence/correlation check)
  net.projvec_corr = utils.list2dotprod(nextProjvec, net.projvec)

  # op to assign the new projection vector for next iteration
  with tf.control_dependencies([net.projvec_corr]):
    with tf.variable_scope('projvec_op'):
      net.projvec_op = [tf.assign(p,n) for p,n in zip(net.projvec, nextProjvec)]
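
# For reference, a minimal sketch of what a compute_nextProjvec helper could do
# (hypothetical; the repository's actual implementation may differ): blend the previous
# projection vector with the new hessian-vector product (or with a fresh random vector
# when randvec=True) and renormalize, i.e. one damped power-iteration step.
def compute_nextProjvec_sketch(projvec, hvp, beta, randvec=False):
  if randvec:
    # replace the hvp with noise so the resulting "eigenvalue" is a random-direction baseline
    hvp = [tf.random_normal(tf.shape(p)) for p in projvec]
  blended = [beta * p + (1 - beta) * h for p, h in zip(projvec, hvp)]
  norm = tf.sqrt(tf.add_n([tf.reduce_sum(tf.square(b)) for b in blended]))
  return [b / norm for b in blended]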
Example No. 26
0
    def train(self, args):
        ''' import data, train model, save model
        '''
        args.data_dir = args.data_dir + args.style + '/'
        args.save_dir = args.save_dir + args.style + '/'
        print(args)
        if args.attention is True:
            print('attention mode')
        text_parser = TextParser(args)
        args.vocab_size = text_parser.vocab_size
        if args.pretrained is True:
            raise ValueError('pretrained has bug now, so don"t set it to be True now!!!')
            if args.keep is False:
                raise ValueError('when pre-trained is True, keep must be true!')
            print("pretrained and keep mode...")
            print("restoring pretrained model file")
            ckpt = tf.train.get_checkpoint_state("/home/pony/github/jaylyrics_generation_tensorflow/data/pre-trained/")
            if os.path.exists(os.path.join("./data/pre-trained/", 'config.pkl')) and \
                    os.path.exists(os.path.join("./data/pre-trained/", 'words_vocab.pkl')) and \
                    ckpt and ckpt.model_checkpoint_path:
                with open(os.path.join("./data/pre-trained/", 'config.pkl'), 'rb') as f:
                    saved_model_args = cPickle.load(f)
                with open(os.path.join("./data/pre-trained/", 'words_vocab.pkl'), 'rb') as f:
                    saved_words, saved_vocab = cPickle.load(f)
            else:
                raise ValueError('configuration doesn"t exist!')
        else:
            ckpt = tf.train.get_checkpoint_state(args.save_dir)

        if args.keep is True and args.pretrained is False:
            # check if all necessary files exist
            if os.path.exists(os.path.join(args.save_dir, 'config.pkl')) and \
                    os.path.exists(os.path.join(args.save_dir, 'words_vocab.pkl')) and \
                    ckpt and ckpt.model_checkpoint_path:
                with open(os.path.join(args.save_dir, 'config.pkl'), 'rb') as f:
                    saved_model_args = cPickle.load(f)
                with open(os.path.join(args.save_dir, 'words_vocab.pkl'), 'rb') as f:
                    saved_words, saved_vocab = cPickle.load(f)
            else:
                raise ValueError('configuration doesn"t exist!')

        if args.model == 'seq2seq_rnn':
            model = Model_rnn(args)
        else:
            # TO ADD OTHER MODEL
            pass
        trainable_num_params = count_params(model, mode='trainable')
        all_num_params = count_params(model, mode='all')
        args.num_trainable_params = trainable_num_params
        args.num_all_params = all_num_params
        with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f:
            cPickle.dump(args, f)
        with open(os.path.join(args.save_dir, 'words_vocab.pkl'), 'wb') as f:
            cPickle.dump((text_parser.vocab_dict, text_parser.vocab_list), f)

        with tf.Session() as sess:
            if args.keep is True:
                print('Restoring')
                model.saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                print('Initializing')
                sess.run(model.initial_op)

            sess.run(tf.assign(model.lr, args.learning_rate))
            for e in range(args.num_epochs):
                start = time.time()
                model.initial_state = tf.convert_to_tensor(model.initial_state)
                state = model.initial_state.eval()
                total_loss = []
                for b in range(text_parser.num_batches):
                    x, y = text_parser.next_batch()
                    if args.attention is True:
                        attention_states = sess.run(tf.truncated_normal(
                            [args.batch_size, model.attn_length, model.attn_size],
                            stddev=0.1, dtype=tf.float32))

                        feed = {model.input_data: x, model.targets: y,
                                model.initial_state: state,
                                model.attention_states: attention_states}

                    else:
                        feed = {model.input_data: x,
                                model.targets: y,
                                model.initial_state: state}

                    train_loss, state, _ = sess.run([model.cost,
                                                     model.final_state,
                                                     model.train_op],
                                                    feed)
                    total_loss.append(train_loss)
                    print("{}/{} (epoch {}), train_loss = {:.3f}"
                          .format(e * text_parser.num_batches + b,
                                  args.num_epochs * text_parser.num_batches,
                                  e, train_loss))
                    if (e * text_parser.num_batches + b) % args.save_every == 0:
                        checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
                        model.saver.save(sess, checkpoint_path, global_step=e)
                        print("model has been saved in:" + str(checkpoint_path))
                end = time.time()
                delta_time = end - start
                ave_loss = np.array(total_loss).mean()
                logging(model, ave_loss, e, delta_time, mode='train')
                if ave_loss < 0.1:
                    checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
                    model.saver.save(sess, checkpoint_path, global_step=e)
                    print("model has been saved in:" + str(checkpoint_path))
                    break
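
# The count_params(model, mode=...) calls above suggest a helper that can count either the
# trainable or all variables of a TensorFlow model. A minimal sketch under that assumption
# (not necessarily this project's own implementation, whose signature is only inferred here):
def count_params_sketch(model, mode='trainable'):
    # model is unused in this sketch; variables are read from the default TensorFlow graph
    variables = tf.trainable_variables() if mode == 'trainable' else tf.global_variables()
    return int(sum(np.prod(v.get_shape().as_list()) for v in variables))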
Example No. 27
0
    losses = unweighted_loss * class_weights
else:
    if args.loss_func == "cross_entropy":
        losses = tf.nn.softmax_cross_entropy_with_logits(logits=network,
                                                         labels=net_output)
    elif args.loss_func == "lovasz":
        losses = utils.lovasz_softmax(probas=network, labels=net_output)
loss = tf.reduce_mean(losses)

opt = tf.train.AdamOptimizer(0.0001).minimize(
    loss, var_list=tf.trainable_variables())

saver = tf.train.Saver(max_to_keep=1000)
sess.run(tf.global_variables_initializer())

utils.count_params()

# If a pre-trained ResNet is required, load the weights.
# This must be done AFTER the variables are initialized with sess.run(tf.global_variables_initializer())
if init_fn is not None:
    init_fn(sess)

# Load a previous checkpoint if desired
model_checkpoint_name = "checkpoints/latest_model_" + args.model + "_" + args.dataset + ".ckpt"
if args.continue_training or args.mode != "train":
    saver.restore(sess, model_checkpoint_name)
    print('Loaded latest model checkpoint')

avg_scores_per_epoch = []

# Load the data
Example No. 28
0
        tf.summary.scalar("lambda", decayed_reg)
        summary_op = tf.summary.merge_all()

        name = str(lr) + '_' + str(bs) + '_' +  str(nn)
        train_writer = tf.summary.FileWriter(tensorboard_log + name + '/train/', graph = sess.graph)
        val_writer = tf.summary.FileWriter(tensorboard_log + name + '/val/')

        # Initialize all variables
        sess.run(tf.global_variables_initializer())

        # Load the pretrained weights into the non-trainable layer
        model_rgb.load_params(sess, params_dir_rgb, trainable=False)
        model_depth.load_params(sess, params_dir_depth, trainable=False)

        print("\nHyper-parameters: lr={}, #neurons={}, bs={}, l2={}, max_norm={}, dropout_rate={}".format(lr,nn,bs,aa,mn,do))     
        print("Number of trainable parameters = {}".format(count_params(trainable_variables_rnn)+count_params(trainable_variables_conv1x1)))    

        print("\n{} Generate features from training set".format(datetime.now()))
         
        tb_train_count=0        
        tb_val_count = 0

 
        # Loop over number of epochs
        num_samples = 0
        # Training set
        
        sess.run(training_init_op)

        # Progress bar setting
        bar = progressbar.ProgressBar(maxval=tr_batches_per_epoch, widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
Example No. 29
0
    def test_count_params(self):
        linear = nn.Linear(123, 42)
        n_weights = 123 * 42
        n_bias = 42
        n_total = n_weights + n_bias
        self.assertEqual(n_total, count_params(linear))
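
# A count_params implementation consistent with the unit test above would simply sum the
# element counts of a PyTorch module's parameters. This is a sketch of one plausible
# implementation, not necessarily the one under test:
def count_params_sketch(module):
    return sum(p.numel() for p in module.parameters())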
def main():
    args = parse_args()

    if args.name is None:
        if args.deepsupervision:
            args.name = '%s_%s_wDS' % (args.dataset, args.arch)
        else:
            args.name = '%s_%s_woDS' % (args.dataset, args.arch)
    if not os.path.exists('models/%s' % args.name):
        os.makedirs('models/%s' % args.name)

    print('Config -----')
    for arg in vars(args):
        print('%s: %s' % (arg, getattr(args, arg)))
    print('------------')

    with open('models/%s/args.txt' % args.name, 'w') as f:
        for arg in vars(args):
            print('%s: %s' % (arg, getattr(args, arg)), file=f)

    joblib.dump(args, 'models/%s/args.pkl' % args.name)

    # define loss function (criterion)
    if args.loss == 'BCEWithLogitsLoss':
        criterion = nn.BCEWithLogitsLoss().cuda()
    else:
        criterion = losses.__dict__[args.loss]().cuda()

    cudnn.benchmark = True

    DATA_PATH = '../../Datasets/'

    img_paths = []
    mask_paths = []
    for class_folder in os.listdir(DATA_PATH):
        FOLDER_PATH = os.path.join(DATA_PATH, class_folder)
        for patient_folder in os.listdir(FOLDER_PATH):
            patient_folder = os.path.join(FOLDER_PATH, patient_folder)
            if os.path.isdir(patient_folder):
                if (os.path.isfile(
                        os.path.join(patient_folder, 'LAT/Lat_Vertebra.png'))):
                    mask_paths.append(
                        os.path.join(patient_folder, 'LAT/Lat_Vertebra.png'))
                    img_paths.append(os.path.join(patient_folder, "LAT.jpg"))

    c = list(zip(img_paths, mask_paths))

    random.shuffle(c)

    img_paths, mask_paths = zip(*c)
    img_paths = np.array(img_paths)
    mask_paths = np.array(mask_paths)

    k = 10
    kf = KFold(n_splits=k)
    fold_num = 0
    mean_ious = []
    for train_index, test_index in kf.split(img_paths):
        train_img_paths, val_img_paths, train_mask_paths, val_mask_paths = \
        train_test_split(img_paths[train_index], mask_paths[train_index], test_size=0.08, random_state=41)

        # create model
        print("=> creating model %s for fold %s" % (args.arch, fold_num))
        fold_num += 1
        model = archs.__dict__[args.arch](args)

        model = model.cuda()

        print(count_params(model))

        if args.optimizer == 'Adam':
            optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                          model.parameters()),
                                   lr=args.lr)
        elif args.optimizer == 'SGD':
            optimizer = optim.SGD(filter(lambda p: p.requires_grad,
                                         model.parameters()),
                                  lr=args.lr,
                                  momentum=args.momentum,
                                  weight_decay=args.weight_decay,
                                  nesterov=args.nesterov)

        train_dataset = Dataset(args, train_img_paths, train_mask_paths,
                                args.aug)
        val_dataset = Dataset(args, val_img_paths, val_mask_paths)

        train_loader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   pin_memory=True,
                                                   drop_last=True)
        val_loader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=args.batch_size,
                                                 shuffle=False,
                                                 pin_memory=True,
                                                 drop_last=False)

        log = pd.DataFrame(
            index=[],
            columns=['epoch', 'lr', 'loss', 'iou', 'val_loss', 'val_iou'])

        best_iou = 0
        trigger = 0
        for epoch in range(args.epochs):
            print('Epoch [%d/%d]' % (epoch, args.epochs))

            # train for one epoch
            train_log = train(args, train_loader, model, criterion, optimizer,
                              epoch)
            # evaluate on validation set
            val_log = validate(args, val_loader, model, criterion)

            print('loss %.4f - iou %.4f - val_loss %.4f - val_iou %.4f' %
                  (train_log['loss'], train_log['iou'], val_log['loss'],
                   val_log['iou']))

            tmp = pd.Series(
                [
                    epoch,
                    args.lr,
                    train_log['loss'],
                    train_log['iou'],
                    val_log['loss'],
                    val_log['iou'],
                ],
                index=['epoch', 'lr', 'loss', 'iou', 'val_loss', 'val_iou'])

            log = log.append(tmp, ignore_index=True)
            log.to_csv('models/%s/log.csv' % args.name, index=False)

            trigger += 1

            if val_log['iou'] > best_iou:
                torch.save(model.state_dict(),
                           './models/%s/model.pth' % args.name)
                best_iou = val_log['iou']
                print("=> saved best model")
                trigger = 0

            # early stopping
            if args.early_stop is not None:
                if trigger >= args.early_stop:
                    print("=> early stopping")
                    break

            torch.cuda.empty_cache()

        args = joblib.load('models/%s/args.pkl' % args.name)

        if not os.path.exists('output/%s' % args.name):
            os.makedirs('output/%s' % args.name)

        joblib.dump(args, 'models/%s/args.pkl' % args.name)

        # create model
        print("=> Testing model %s" % args.arch)
        model = archs.__dict__[args.arch](args)

        model = model.cuda()

        test_img_paths, test_mask_paths = img_paths[test_index], mask_paths[
            test_index]
        input_paths = test_img_paths

        model.load_state_dict(torch.load('models/%s/model.pth' % args.name))
        model.eval()

        test_dataset = Dataset(args, test_img_paths, test_mask_paths)
        test_loader = torch.utils.data.DataLoader(test_dataset,
                                                  batch_size=args.batch_size,
                                                  shuffle=False,
                                                  pin_memory=True,
                                                  drop_last=False)

        with warnings.catch_warnings():
            warnings.simplefilter('ignore')

            with torch.no_grad():
                for i, (input, target) in tqdm(enumerate(test_loader),
                                               total=len(test_loader)):
                    input = input.cuda()
                    target = target.cuda()

                    # compute output
                    if args.deepsupervision:
                        output = model(input)[-1]
                    else:
                        output = model(input)

                    output = torch.sigmoid(output).data.cpu().numpy()
                    test_img_paths = test_img_paths[args.batch_size *
                                                    i:args.batch_size *
                                                    (i + 1)]

                    imsave(
                        os.path.join("./output/%s" % args.name,
                                     str(i) + ".png"),
                        (output[0, 0, :, :] * 255).astype('uint8'))

            torch.cuda.empty_cache()

        # IoU
        ious = []
        for i in tqdm(range(len(test_mask_paths))):
            input_img = cv2.imread(input_paths[i], 1)[:, :, 0]
            input_img = cv2.resize(input_img, (256, 256))

            mask = np.zeros((256, 256))
            _mask = cv2.imread(test_mask_paths[i])[:, :, 0]
            _mask = cv2.resize(_mask, (256, 256))
            mask = np.maximum(mask, _mask)

            pb = imread('output/%s/' % args.name + str(i) + ".png")

            mask = mask.astype('float32') / 255
            pb = pb.astype('float32') / 255

            iou = iou_score(pb, mask)
            ious.append(iou)
        mean_ious.append(np.mean(ious))
        print("\n")
    print(mean_ious)
    print(np.mean(mean_ious))
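
# iou_score above is assumed to compare a predicted probability map against a binary
# ground-truth mask; a minimal sketch of such a function (hypothetical, with a 0.5
# threshold and a small epsilon to avoid division by zero on empty masks):
def iou_score_sketch(pb, mask, threshold=0.5, eps=1e-7):
    pred = pb > threshold
    gt = mask > threshold
    intersection = np.logical_and(pred, gt).sum()
    union = np.logical_or(pred, gt).sum()
    return (intersection + eps) / (union + eps)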