Example #1
def main_merge():
    global args, best_corr

    args.store_name = '{}_merged'.format(args.model)
    args.store_name = args.store_name + datetime.now().strftime('_%m-%d_%H-%M')
    args.start_epoch = 0

    check_rootfolders(args)

    model = Baseline(args.img_feat_size, args.au_feat_size)

    model = torch.nn.DataParallel(model).cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    if args.use_multistep:
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, args.step_milestones, args.step_decay)
    # ckpt structure {epoch, state_dict, optimizer, best_corr}
    if args.resume and os.path.isfile(args.resume):
        print('Load checkpoint:', args.resume)
        ckpt = torch.load(args.resume)
        args.start_epoch = ckpt['epoch']
        best_corr = ckpt['best_corr']
        model.load_state_dict(ckpt['state_dict'])
        optimizer.load_state_dict(ckpt['optimizer'])
        print('Loaded ckpt at epoch:', args.start_epoch)

    # initialize datasets
    train_loader = torch.utils.data.DataLoader(dataset=EEV_Dataset(
        csv_path=[args.train_csv, args.val_csv],
        vidmap_path=[args.train_vidmap, args.val_vidmap],
        image_feat_path=args.image_features,
        audio_feat_path=args.audio_features,
        mode='merge'),
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=False,
                                               drop_last=True)

    log_training = open(
        os.path.join(args.root_log, args.store_name, 'log.csv'), 'w')
    with open(os.path.join(args.root_log, args.store_name, 'args.txt'),
              'w') as f:
        f.write(str(args))

    tb_writer = SummaryWriter(
        log_dir=os.path.join(args.root_log, args.store_name))
    for epoch in range(args.start_epoch, args.epochs):
        train(train_loader, model, optimizer, epoch, log_training, tb_writer)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'best_corr': 0.0,  # merge mode has no validation, so no best score is tracked
            }, False)
        if args.use_multistep:
            scheduler.step()
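
# `check_rootfolders` and `save_checkpoint` are defined elsewhere in this
# codebase. As a minimal sketch, `save_checkpoint` presumably follows the
# common PyTorch convention below (the directory layout and filenames are
# assumptions, not taken from the source):
import os
import shutil

import torch

def save_checkpoint(state, is_best):
    # Hypothetical layout: <args.root_model>/<args.store_name>/ckpt.pth.tar
    ckpt_dir = os.path.join(args.root_model, args.store_name)
    path = os.path.join(ckpt_dir, 'ckpt.pth.tar')
    torch.save(state, path)
    if is_best:
        # keep a separate copy of the best-scoring checkpoint
        shutil.copyfile(path, os.path.join(ckpt_dir, 'ckpt.best.pth.tar'))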
Example #2
def train():
    # Prepare gym
    env = create_env()
    h, w, c = env.observation_space.shape

    # Prepare models
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_dir, fn = "./policy_grad", '{}.pth'
    os.makedirs(model_dir, exist_ok=True)  # ensure the checkpoint directory exists
    model = Baseline(h, w).to(device)
    model.train()
    optimizer = optim.RMSprop(model.parameters(),
                              lr=LEARN_RATE,
                              weight_decay=WEIGHT_DECAY)

    # Train
    steps_done = 0
    num_episodes = 2000
    episode_rewards = []

    for i_episode in tqdm(range(num_episodes)):
        # Complete 1 episode
        print("Episode {}".format(i_episode + 1))
        i_rewards, i_states, i_actions, steps_done = generate_episode(
            env, model, device, steps_done, episode_rewards)

        # Update model
        optimize_model(device, model, optimizer, i_rewards, i_actions,
                       i_states)

        # Save the model every SAVE_EPI episodes; the filename encodes
        # the most recent episode reward
        if (i_episode + 1) % SAVE_EPI == 0:
            path = os.path.join(model_dir, fn.format(episode_rewards[-1]))
            torch.save(model.state_dict(), path)

    print('Complete')
    np.save('./rewards_policy_grad.npy', episode_rewards)

    env.close()
    plt.ioff()
    plt.show()
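
# `generate_episode` and `optimize_model` are not shown. Below is a minimal
# REINFORCE-style sketch of `optimize_model` consistent with the call site;
# GAMMA, the per-step state tensors, and the model returning action logits
# are all assumptions:
GAMMA = 0.99  # assumed discount factor

def optimize_model(device, model, optimizer, rewards, actions, states):
    # Discounted returns, computed backwards over the episode.
    returns, G = [], 0.0
    for r in reversed(rewards):
        G = r + GAMMA * G
        returns.insert(0, G)
    returns = torch.tensor(returns, device=device)
    # Normalizing returns is a common variance-reduction trick.
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)

    states = torch.stack(states).to(device)  # assumes per-step state tensors
    actions = torch.tensor(actions, device=device)
    log_probs = torch.log_softmax(model(states), dim=-1)
    chosen = log_probs[torch.arange(len(actions), device=device), actions]

    # REINFORCE objective: maximize expected discounted return.
    loss = -(chosen * returns).sum()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()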
Example #3
    device = args.device

    if args.resnet:
        assert args.input_size == 224
        #model = Resnet(args.output_size)
        # note: despite the args.resnet flag, this branch loads EfficientNet
        print('Loading EfficientNet')
        model_name = 'efficientnet-b0'
        print(model_name)

        model = EfficientNet.from_name(model_name)

        #model = EfficientNet.from_pretrained(model_name, num_classes=350)
        #summary(model,input_size=(3,224,224))
    else:
        model = Baseline(args.hidden_size, args.output_size)
    optimizer = optim.Adam(model.parameters(), args.learning_rate)
    lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                        patience=1,
                                                        verbose=True)
    criterion = nn.CrossEntropyLoss()  # multi-class classification task

    model = model.to(device)
    model.train()

    # DONOTCHANGE: They are reserved for nsml
    bind_model(model)
    # restore a previously saved NSML session below
    nsml.load(checkpoint='15', session='team_62/airush1/40')
    nsml.save('stillgoing')

    if args.pause:
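
# The snippet is truncated at `if args.pause:`; NSML baselines typically
# continue with `nsml.paused(scope=locals())`. For context, a minimal sketch
# of the `bind_model` helper following the usual NSML pattern (the weight
# file name and the infer input/output format are assumptions):
def bind_model(model):
    def save(dirname, *args):
        torch.save(model.state_dict(), os.path.join(dirname, 'model.pth'))

    def load(dirname, *args):
        model.load_state_dict(torch.load(os.path.join(dirname, 'model.pth')))

    def infer(data):
        # Inference entry point; the expected format depends on the task
        # and is assumed here.
        model.eval()
        with torch.no_grad():
            return model(data)

    nsml.bind(save=save, load=load, infer=infer)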
Example #4
def train(args):
    start_time = time.time()
    device = torch.device('cuda' if args.cuda else 'cpu')

    pprint(args.__dict__)
    interface = FileInterface(**args.__dict__)
    piqa_model = Baseline(**args.__dict__).to(device)

    loss_model = Loss().to(device)
    optimizer = torch.optim.Adam(p for p in piqa_model.parameters()
                                 if p.requires_grad)

    batch_size = args.batch_size
    char_vocab_size = args.char_vocab_size
    glove_vocab_size = args.glove_vocab_size
    word_vocab_size = args.word_vocab_size
    glove_size = args.glove_size
    elmo = args.elmo
    draft = args.draft

    def preprocess(interface_):
        # get data
        print('Loading train and dev data')
        train_examples = load_squad(interface_.train_path, draft=draft)
        dev_examples = load_squad(interface_.test_path, draft=draft)

        # only needed when constructing the processor
        print('Loading GloVe')
        glove_words, glove_emb_mat = load_glove(
            glove_size,
            vocab_size=args.glove_vocab_size - 2,
            glove_dir=interface_.glove_dir,
            draft=draft)

        print('Constructing processor')
        processor = SquadProcessor(char_vocab_size,
                                   glove_vocab_size,
                                   word_vocab_size,
                                   elmo=elmo)
        processor.construct(train_examples, glove_words)

        # data loader
        print('Preprocessing datasets')
        train_dataset = tuple(
            processor.preprocess(example) for example in train_examples)
        dev_dataset = tuple(
            processor.preprocess(example) for example in dev_examples)

        print('Creating data loaders')
        train_sampler = SquadSampler(train_dataset,
                                     max_context_size=256,
                                     max_question_size=32,
                                     bucket=True,
                                     shuffle=True)
        train_loader = DataLoader(train_dataset,
                                  batch_size=batch_size,
                                  collate_fn=processor.collate,
                                  sampler=train_sampler)

        dev_sampler = SquadSampler(dev_dataset, bucket=True)
        dev_loader = DataLoader(dev_dataset,
                                batch_size=batch_size,
                                collate_fn=processor.collate,
                                sampler=dev_sampler)

        if args.preload:
            train_loader = tuple(train_loader)
            dev_loader = tuple(dev_loader)

        out = {
            'glove_emb_mat': glove_emb_mat,
            'processor': processor,
            'train_dataset': train_dataset,
            'dev_dataset': dev_dataset,
            'train_loader': train_loader,
            'dev_loader': dev_loader
        }

        return out

    out = interface.cache(
        preprocess,
        interface_=interface) if args.cache else preprocess(interface)
    glove_emb_mat = out['glove_emb_mat']
    processor = out['processor']
    train_dataset = out['train_dataset']
    dev_dataset = out['dev_dataset']
    train_loader = out['train_loader']
    dev_loader = out['dev_loader']

    print("Initializing model weights")
    piqa_model.load_glove(torch.tensor(glove_emb_mat))

    bind_model(interface, processor, piqa_model, optimizer=optimizer)

    step = 0
    best_report = None

    print('Training')
    piqa_model.train()
    for epoch_idx in range(args.epochs):
        for i, train_batch in enumerate(train_loader):
            train_batch = {
                key: val.to(device)
                for key, val in train_batch.items()
            }
            model_output = piqa_model(step=step, **train_batch)
            train_results = processor.postprocess_batch(
                train_dataset, train_batch, model_output)
            train_loss = loss_model(step=step, **model_output, **train_batch)
            train_f1 = float(
                np.mean([result['f1'] for result in train_results]))
            train_em = float(
                np.mean([result['em'] for result in train_results]))

            # optimize
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()
            step += 1

            # report & eval & save
            if step % args.report_period == 1:
                report = OrderedDict(step=step,
                                     train_loss=train_loss.item(),
                                     train_f1=train_f1,
                                     train_em=train_em,
                                     time=time.time() - start_time)
                interface.report(**report)
                print(', '.join('%s=%.5r' % (s, r) for s, r in report.items()))

            if step % args.eval_save_period == 1:
                with torch.no_grad():
                    piqa_model.eval()
                    loss_model.eval()
                    pred = {}
                    dev_losses, dev_results = [], []
                    for dev_batch, _ in zip(dev_loader,
                                            range(args.eval_steps)):
                        dev_batch = {
                            key: val.to(device)
                            for key, val in dev_batch.items()
                        }
                        model_output = piqa_model(**dev_batch)
                        results = processor.postprocess_batch(
                            dev_dataset, dev_batch, model_output)

                        dev_loss = loss_model(step=step,
                                              **dev_batch,
                                              **model_output)

                        for result in results:
                            pred[result['id']] = result['pred']
                        dev_results.extend(results)
                        dev_losses.append(dev_loss.item())

                    dev_loss = float(np.mean(dev_losses))
                    dev_f1 = float(
                        np.mean([result['f1'] for result in dev_results]))
                    dev_em = float(
                        np.mean([result['em'] for result in dev_results]))

                    report = OrderedDict(step=step,
                                         dev_loss=dev_loss,
                                         dev_f1=dev_f1,
                                         dev_em=dev_em,
                                         time=time.time() - start_time)
                    summary = False
                    if best_report is None or report['dev_f1'] > best_report[
                            'dev_f1']:
                        best_report = report
                        summary = True
                        interface.save(iteration=step)
                        interface.pred(pred)
                    interface.report(summary=summary, **report)
                    print(
                        ', '.join('%s=%.5r' % (s, r)
                                  for s, r in report.items()),
                        '(dev_f1_best=%.5r @%d)' %
                        (best_report['dev_f1'], best_report['step']))
                    piqa_model.train()
                    loss_model.train()

            if step == args.train_steps:
                break
        if step == args.train_steps:
            break
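
# The per-result `f1` and `em` fields averaged above are the standard SQuAD
# metrics. For reference, a condensed version of the official scoring logic:
import re
import string
from collections import Counter

def normalize_answer(s):
    # Lowercase, drop punctuation and articles, collapse whitespace.
    s = s.lower()
    s = ''.join(ch for ch in s if ch not in set(string.punctuation))
    s = re.sub(r'\b(a|an|the)\b', ' ', s)
    return ' '.join(s.split())

def exact_match(prediction, ground_truth):
    return float(normalize_answer(prediction) == normalize_answer(ground_truth))

def f1_score(prediction, ground_truth):
    pred_tokens = normalize_answer(prediction).split()
    gt_tokens = normalize_answer(ground_truth).split()
    common = Counter(pred_tokens) & Counter(gt_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gt_tokens)
    return 2 * precision * recall / (precision + recall)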
Example #5
def main_train(config, checkpoint_dir=None):
    global args, best_corr
    best_corr = 0.0

    args.store_name = '{}'.format(args.model)
    args.store_name = args.store_name + datetime.now().strftime(
        '_%m-%d_%H-%M-%S')
    args.start_epoch = 0

    # check_rootfolders(args)
    if args.model == 'Baseline':
        model = Baseline()
    elif args.model == 'TCFPN':
        model = TCFPN(layers=[48, 64, 96],
                      in_channels=(2048 + 128),
                      num_classes=15,
                      kernel_size=11)
    
    model = torch.nn.DataParallel(model).cuda()

    if config['optimizer'] == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    elif config['optimizer'] == 'adamw':
        optimizer = torch.optim.AdamW(model.parameters(), lr=config['lr'])
    
    # custom optimizer
    if args.use_sam:
        base_optim = torch.optim.Adam
        optimizer = SAM(model.parameters(), base_optim, lr=config['lr'])
    # custom lr scheduler
    if args.use_cos_wr:
        scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
            optimizer, T_0=args.cos_wr_t0, T_mult=args.cos_wr_t_mult)
    elif args.use_cos:
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.cos_t_max)
    # SWA
    if args.use_swa:
        swa_model = torch.optim.swa_utils.AveragedModel(model)
        swa_scheduler = torch.optim.swa_utils.SWALR(optimizer, swa_lr=config['lr'])

    # ckpt structure {epoch, state_dict, optimizer, best_corr}
    # if args.resume and os.path.isfile(args.resume):
    #     print('Load checkpoint:', args.resume)
    #     ckpt = torch.load(args.resume)
    #     args.start_epoch = ckpt['epoch']
    #     best_corr = ckpt['best_corr']
    #     model.load_state_dict(ckpt['state_dict'])
    #     optimizer.load_state_dict(ckpt['optimizer'])
    #     print('Loaded ckpt at epoch:', args.start_epoch)
    if checkpoint_dir:
        model_state, optimizer_state = torch.load(
            os.path.join(checkpoint_dir, "checkpoint"))
        model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)


    # initialize datasets
    train_loader = torch.utils.data.DataLoader(
        dataset=EEV_Dataset(
            csv_path=args.train_csv,
            vidmap_path=args.train_vidmap,
            image_feat_path=args.image_features,
            audio_feat_path=args.audio_features,
            mode='train', lpfilter=args.lp_filter
        ),
        batch_size=config['batch_size'], shuffle=True,
        num_workers=args.workers, pin_memory=False,
        drop_last=True
    )

    val_loader = torch.utils.data.DataLoader(
        dataset=EEV_Dataset(
            csv_path=args.val_csv,
            vidmap_path=args.val_vidmap,
            image_feat_path=args.image_features,
            audio_feat_path=args.audio_features,
            mode='val'
        ),
        batch_size=None, shuffle=False,
        num_workers=args.workers, pin_memory=False
    )

    accuracy = correlation
    # with open(os.path.join(args.root_log, args.store_name, 'args.txt'), 'w') as f:
    #     f.write(str(args))
    
    # tb_writer = SummaryWriter(log_dir=os.path.join(args.root_log, args.store_name))

    for epoch in range(args.start_epoch, args.epochs):
        # train
        train(train_loader, model, optimizer, epoch, None, None)
        # do lr scheduling after epoch
        if args.use_swa and epoch >= args.swa_start:
            print('swa stepping...')
            swa_model.update_parameters(model)
            swa_scheduler.step()
        elif args.use_cos_wr:
            print('cos warm restart (T0:{} Tm:{}) stepping...'.format(args.cos_wr_t0, args.cos_wr_t_mult))
            scheduler.step()
        elif args.use_cos:
            print('cos (Tmax:{}) stepping...'.format(args.cos_t_max))
            scheduler.step()
        
        # validate
        if args.use_swa and epoch >= args.swa_start:
            # validate use swa model
            corr, loss = validate(val_loader, swa_model, accuracy, epoch, None, None)
        else:
            corr, loss = validate(val_loader, model, accuracy, epoch, None, None)
        is_best = corr > best_corr
        best_corr = max(corr, best_corr)
        # tb_writer.add_scalar('acc/validate_corr_best', best_corr, epoch)
        # output_best = 'Best corr: %.4f\n' % (best_corr)
        # print(output_best)
        # save_checkpoint({
        #     'epoch': epoch + 1,
        #     'state_dict': model.state_dict(),
        #     'optimizer': optimizer.state_dict(),
        #     'best_corr': best_corr,
        # }, is_best)
        with tune.checkpoint_dir(epoch) as checkpoint_dir:
            # always write "checkpoint" so the restore path above keeps
            # working; additionally keep a copy of the best model
            path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save((model.state_dict(), optimizer.state_dict()), path)
            if is_best:
                best_path = os.path.join(checkpoint_dir, "checkpoint_best")
                torch.save((model.state_dict(), optimizer.state_dict()),
                           best_path)
        tune.report(loss=loss, accuracy=corr, best_corr=best_corr)
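
# `tune.checkpoint_dir` and `tune.report` mark `main_train` as a Ray Tune
# trainable (function API). A sketch of how it would typically be launched;
# the search space and resource values here are assumptions:
from ray import tune

config = {
    'optimizer': tune.choice(['adam', 'adamw']),
    'lr': tune.loguniform(1e-5, 1e-2),
    'batch_size': tune.choice([16, 32, 64]),
}
analysis = tune.run(
    main_train,
    config=config,
    num_samples=20,  # number of sampled trials (assumed)
    metric='best_corr',
    mode='max',
    resources_per_trial={'cpu': 4, 'gpu': 1},
)
print('Best config:', analysis.best_config)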
Example #6
def main_train():
    global args, best_corr

    args.store_name = '{}'.format(args.model)
    args.store_name = args.store_name + datetime.now().strftime(
        '_%m-%d_%H-%M-%S')
    args.start_epoch = 0

    if not args.val_only:
        check_rootfolders(args)
    if args.model == 'Baseline':
        if args.cls_indices:
            model = Baseline(args.img_feat_size,
                             args.au_feat_size,
                             num_classes=len(args.cls_indices))
        else:
            print('Feature size:', args.img_feat_size, args.au_feat_size)
            model = Baseline(args.img_feat_size, args.au_feat_size)
    elif args.model == 'TCFPN':
        model = TCFPN(layers=[48, 64, 96],
                      in_channels=(128),
                      num_classes=15,
                      kernel_size=11)
    elif args.model == 'BaseAu':
        model = Baseline_Au(args.au_feat_size)
    elif args.model == 'BaseImg':
        model = Baseline_Img(args.img_feat_size)
    elif args.model == 'EmoBase':
        model = EmoBase()

    model = torch.nn.DataParallel(model).cuda()

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.learning_rate,
                                 weight_decay=args.weight_decay)
    # optimizer = torch.optim.AdamW(model.parameters(), lr=args.learning_rate)
    # custom optimizer
    if args.use_sam:
        base_optim = torch.optim.Adam
        optimizer = SAM(model.parameters(), base_optim, lr=args.learning_rate)
    # custom lr scheduler
    if args.use_cos_wr:
        scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
            optimizer, T_0=args.cos_wr_t0, T_mult=args.cos_wr_t_mult)
    elif args.use_cos:
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, args.cos_t_max)
    elif args.use_multistep:
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, args.step_milestones, args.step_decay)
    # SWA
    if args.use_swa:
        swa_model = torch.optim.swa_utils.AveragedModel(model)
        swa_scheduler = torch.optim.swa_utils.SWALR(optimizer,
                                                    swa_lr=args.learning_rate)

    # ckpt structure {epoch, state_dict, optimizer, best_corr}
    if args.resume and os.path.isfile(args.resume):
        print('Load checkpoint:', args.resume)
        ckpt = torch.load(args.resume)
        args.start_epoch = ckpt['epoch']
        best_corr = ckpt['best_corr']
        model.load_state_dict(ckpt['state_dict'])
        optimizer.load_state_dict(ckpt['optimizer'])
        print('Loaded ckpt at epoch:', args.start_epoch)

    # initialize datasets
    train_loader = torch.utils.data.DataLoader(dataset=EEV_Dataset(
        csv_path=args.train_csv,
        vidmap_path=args.train_vidmap,
        image_feat_path=args.image_features,
        audio_feat_path=args.audio_features,
        mode='train',
        lpfilter=args.lp_filter,
        train_freq=args.train_freq,
        val_freq=args.val_freq,
        cls_indices=args.cls_indices),
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=False,
                                               drop_last=True)

    val_loader = torch.utils.data.DataLoader(dataset=EEV_Dataset(
        csv_path=args.val_csv,
        vidmap_path=args.val_vidmap,
        image_feat_path=args.image_features,
        audio_feat_path=args.audio_features,
        mode='val',
        train_freq=args.train_freq,
        val_freq=args.val_freq,
        cls_indices=args.cls_indices,
        repeat_sample=args.repeat_sample),
                                             batch_size=None,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=False)

    accuracy = correlation

    if args.val_only:
        print('Run validation ...')
        print('start epoch:', args.start_epoch, 'model:', args.resume)
        validate(val_loader, model, accuracy, args.start_epoch, None, None)
        return

    log_training = open(
        os.path.join(args.root_log, args.store_name, 'log.csv'), 'w')
    with open(os.path.join(args.root_log, args.store_name, 'args.txt'),
              'w') as f:
        f.write(str(args))

    tb_writer = SummaryWriter(
        log_dir=os.path.join(args.root_log, args.store_name))
    for epoch in range(args.start_epoch, args.epochs):
        train(train_loader, model, optimizer, epoch, log_training, tb_writer)
        # do lr scheduling after epoch
        if args.use_swa and epoch >= args.swa_start:
            print('swa stepping...')
            swa_model.update_parameters(model)
            swa_scheduler.step()
        elif args.use_cos_wr or args.use_cos or args.use_multistep:
            scheduler.step()

        if (epoch + 1) > 2 and ((epoch + 1) % args.eval_freq == 0 or
                                (epoch + 1) == args.epochs):
            # validate
            if args.use_swa and epoch >= args.swa_start:
                # validate use swa model
                corr = validate(val_loader, swa_model, accuracy, epoch,
                                log_training, tb_writer)
            else:
                corr = validate(val_loader, model, accuracy, epoch,
                                log_training, tb_writer)
            is_best = corr > best_corr
            best_corr = max(corr, best_corr)
            tb_writer.add_scalar('acc/validate_corr_best', best_corr, epoch)
            output_best = 'Best corr: %.4f\n' % (best_corr)
            print(output_best)
            log_training.write(output_best + '\n')
            log_training.flush()

            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'best_corr': best_corr,
                }, is_best)
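
    # Caveat: when args.use_swa is set, the loop above checkpoints `model`,
    # not the averaged weights. Following the torch.optim.swa_utils recipe,
    # the averaged model's BatchNorm statistics should be recomputed after
    # training (a sketch; a manual forward loop may be needed if the loader
    # yields more than plain input tensors):
    if args.use_swa:
        torch.optim.swa_utils.update_bn(train_loader, swa_model)
        save_checkpoint(
            {
                'epoch': args.epochs,
                'state_dict': swa_model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'best_corr': best_corr,
            }, False)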
Example #7
    elif args.model == 'mobilenet':
        assert args.input_size == 158
        model = torch.hub.load('pytorch/vision',
                               'mobilenet_v2',
                               pretrained=True)
        model.classifier = nn.Sequential(nn.Dropout(0.2),
                                         nn.Linear(1280, args.num_classes))
    else:
        raise NotImplementedError

    if use_gpu:
        model = model.to(device)

    if args.optimizer == 'Adam':
        # optimizer = optim.Adam(model.parameters(), args.lr, weight_decay=0.00025)
        optimizer = AdamW(model.parameters(), args.lr, weight_decay=0.000025)
    elif args.optimizer == 'SGD':
        optimizer = optim.SGD(model.parameters(),
                              args.lr,
                              momentum=0.9,
                              weight_decay=0.025)
    else:
        raise NotImplementedError

    if args.scheduler == 'plateau':
        scheduler = ReduceLROnPlateau(optimizer, 'max', patience=2, factor=0.5)
    elif args.scheduler == 'cosine':
        eta_min = 1e-5
        T_max = 10
        T_mult = 1
        restart_decay = 0.97
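
        # The snippet ends before the scheduler is built. A plausible
        # completion using PyTorch's built-in warm restarts; restart_decay
        # has no built-in counterpart and would need a custom scheduler,
        # so it is left unused here:
        scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(
            optimizer, T_0=T_max, T_mult=T_mult, eta_min=eta_min)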
Example #8
model_path = 'model_mobilefacenet.pth'

# model_name = 'MiniXception'
# model_path = ' '

# model_name = 'ConvNet'
# model_path = ' '

# model_name = 'MixNet'
# model_path = ' '

model = Baseline(model='train', model_name=model_name, model_path=model_path)
#model.load_param('models/model_1_180000.pth')
model = model.cuda()

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
#exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

kd_id = 0
kd_num = 7
batch_size = 48
instance_num = 1
train_data, val_data, trains, vals = make_dataloader(kd_id, kd_num)
train_loader = DataLoader(dataset=train_data,
                          batch_size=batch_size,
                          sampler=RandomSampler(trains, batch_size,
                                                instance_num),
                          shuffle=False,
                          num_workers=2,
                          collate_fn=train_collate)
#train_loader = DataLoader(dataset=train_data, batch_size=48, shuffle=False, num_workers=2, collate_fn=train_collate)
    elif args.model == "Resnet152":
        model = Resnet152(args.output_size)
    elif args.model == "Resnext101":
        model = Resnext101(args.output_size)
    elif args.model == "baseline":
        model = Baseline(args.hidden_size, args.output_size)
    elif args.model == "WideResnet101":
        model = WideResnet101(args.output_size)
    elif args.model.split("-")[0] == "efficientnet":
        model = EfficientNet.from_pretrained(args.model, args.output_size)
    else:
        raise Exception("model type is invalid : " + args.model)

    if args.mode == "train":
        if args.optimizer == "adam":
            optimizer = optim.Adam(model.parameters(),
                                   args.learning_rate,
                                   weight_decay=args.weight_decay)
        elif args.optimizer == "sgd":
            optimizer = optim.SGD(model.parameters(),
                                  lr=args.learning_rate,
                                  momentum=0.9,
                                  weight_decay=args.weight_decay,
                                  nesterov=args.nesterov)
        # elif args.optimizer == "adabound":
        #     optimizer = adabound.AdaBound(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay)
        elif args.optimizer == "adamw":
            optimizer = optim.AdamW(model.parameters(),
                                    args.learning_rate,
                                    weight_decay=args.weight_decay)
        else: