def create_session(self, improve_by=5, min_epoch=10):
        self.objs['saver'] = tf.train.Saver()
        # self.objs['sess'] = tf.Session(config = self.session_config)
        self.objs['sess'] = tf.InteractiveSession()
        self.objs['sess'].run(tf.global_variables_initializer())
        self.objs['es'] = utils.EarlyStopping(self.objs['sess'],
                                              self.objs['saver'],
                                              save_dir="saved_seed%d" %
                                              self.seed,
                                              improve_by=improve_by,
                                              min_epoch=min_epoch)

        if self.feature_extractor_needed:
            if not os.path.exists("vgg16_cifar100"):
                print("Pretrained model doesnt exist for VGG16")
                print("Run cifar100.py first")
                exit(0)
            else:
                reqd_variables = tf.get_collection(
                    tf.GraphKeys.GLOBAL_VARIABLES, scope="feature_extractor")
                feature_extractor_saver = tf.train.Saver(reqd_variables)
                print("Restoring feature extractor variables")
                feature_extractor_saver.restore(self.objs['sess'],
                                                "vgg16_cifar100/saved.ckpt")
                print("Done")
Example #2
 def optimize(self):
     """
     Train the network. For each iteration, call the optimization loop function.
     """
     print(colored('starting optimization with ADAM...', 'cyan'))
     self.optimizer = torch.optim.Adam(self.parameters, lr=self.args.lr)
     scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, mode='min',
                                                            factor=self.args.lr_factor,
                                                            threshold=self.args.lr_thresh,
                                                            patience=self.args.lr_patience)
     # stop after no improvements greater than a certain percentage of the previous loss
     stopper = u.EarlyStopping(patience=self.args.earlystop_patience,
                               min_delta=self.args.earlystop_min_delta,
                               percentage=True)
     start = time()
     for j in range(self.args.epochs):
         self.optimizer.zero_grad()
         loss = self.optimization_loop()
         self.optimizer.step()
         if self.args.reduce_lr:
             scheduler.step(loss)
         if stopper.step(loss):  # stopper is computed on loss, as we don't have any validation metrics
             break
     
     self.elapsed = time() - start
     print(colored(u.sec2time(self.elapsed), 'yellow'))
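
The optimize() method above assumes a loss-based stopper whose step(loss) returns True once the loss has failed to improve for patience consecutive calls, with min_delta read as a percentage of the best loss when percentage=True. Below is a minimal illustrative sketch of such a helper; it is an assumption added for clarity, and the actual u.EarlyStopping / utils.EarlyStopping in each repository may differ.

class EarlyStopping:
    """Illustrative loss-based early stopper with a step() API (sketch, not the original utils code)."""

    def __init__(self, patience=10, min_delta=0.0, percentage=False):
        self.patience = patience        # allowed number of consecutive non-improving steps
        self.min_delta = min_delta      # required improvement, absolute or in percent
        self.percentage = percentage
        self.best = None
        self.num_bad_steps = 0

    def step(self, loss):
        loss = float(loss)
        if self.best is None:           # first call: just record the value
            self.best = loss
            return False
        # interpret min_delta as a percentage of the best loss when requested
        delta = abs(self.best) * self.min_delta / 100 if self.percentage else self.min_delta
        if loss < self.best - delta:    # improved by at least min_delta
            self.best = loss
            self.num_bad_steps = 0
        else:
            self.num_bad_steps += 1
        return self.num_bad_steps >= self.patience   # True -> stop training
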
Example #3
def run(fold, model_name):
    writer = SummaryWriter(log_dir=f'{SAVE_PATH}/', filename_suffix=f'{model_name}-fold{fold}')
    dfx = pd.read_csv(config.TRAINING_FILE)

    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)
    print(df_train.shape)
    print(df_valid.shape)
    train_dataset = dataset.TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4
    )

    valid_dataset = dataset.TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=2
    )

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device('cpu')
    print(f'training on {device}')
    model_config = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)

    optimizer = AdamW(params.optimizer_params(model), lr=5e-5)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)

    es = utils.EarlyStopping(patience=5, mode="max")
    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler, writer=writer)
        jaccard = engine.eval_fn(valid_data_loader, model, device, writer)
        print(f"Jaccard Score = {jaccard}")
        print(f"Epoch={epoch}, Jaccard={jaccard}")
        es(jaccard, model, model_path=f"{SAVE_PATH}/{model_name}-f{fold}.pt")
        if es.early_stop:
            print("Early stopping")
            break
Example #4
 def __init__(self, model: Any, model_name: str = None):
     super().__init__(model)
     self.model_name = model_name
     self.device = xm.xla_device()
     self.optimizer = transformers.AdamW(self.model.parameters(),
                                         lr=1e-4 * xm.xrt_world_size())
     self.criterion = nn.BCEWithLogitsLoss()
     self.early_stopping = utils.EarlyStopping(patience=5, verbose=True)
     self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
         self.optimizer, mode="max", patience=5, factor=0.3, verbose=True)
Example #5
def run_trial(params, grid_logger=None, grid_logger_avg=None, cuda_id=0):
    if not params["is_nni"]:
        results_logger = utils.Results(grid_logger, grid_logger_avg, params)

    for it in range(params['iterations']):
        print("Starting Trial")
        print(params)

        dataset = params['loader']
        model = ModelRunner(params, dataset, cuda_device=cuda_id)
        model.architecture()

        early_stopping = utils.EarlyStopping(
            patience=params['early_stopping_patience'], verbose=True)

        for epoch in range(int(params['epochs'])):

            train_results = model.train(epoch)  # train
            valid_results = model.validation(epoch)  # validation

            if params["is_nni"]:
                if epoch % 1 == 0:
                    nni.report_intermediate_result(
                        train_results["f1_score_macro"])

            if not params['is_nni']:
                results_logger.insert_scores(train_results=train_results,
                                             valid_results=valid_results)
                utils.print_log_data(train_results=train_results,
                                     valid_results=valid_results,
                                     epoch=epoch)

            if epoch == int(params['epochs']) - 1:
                test_results, best_epoch = model.test()  # test
                if params["is_nni"]:
                    nni.report_final_result(valid_results["f1_score_macro"])

                else:
                    results_logger.insert_scores(test_results=test_results)
                    utils.print_log_data(train_results=train_results,
                                         valid_results=valid_results,
                                         epoch=epoch)
                    results_logger.write_log(it, best_epoch)
                    results_logger.write_avg_log()

            valid_loss = valid_results['loss'] + valid_results['tempo_loss']
            early_stopping(valid_loss, model.net)
            if early_stopping.early_stop:
                print(f"Early stopping, epoch:{epoch}")
                break

        print("done")
Example #6
def run():
    seed_everything(config.SEED)
    df_train = pd.read_csv(
        config.TRAINING_FILE).dropna().reset_index(drop=True)

    train_dataset = TweetDataset(tweet=df_train.text.values,
                                 sentiment=df_train.sentiment.values,
                                 selected_text=df_train.selected_text.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    device = torch.device("cuda")
    model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * EPOCHS)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.001
        },
        {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        },
    ]
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    es = utils.EarlyStopping(patience=2, mode="max")

    for epoch in range(EPOCHS):
        engine.train_fn(train_data_loader,
                        model,
                        optimizer,
                        device,
                        scheduler=scheduler)
        if epoch + 1 == MAX_EPOCHS:
            torch.save(model.state_dict(), 'model_full.bin')
            break
Example #7
def run(train_path,
        dev_path,
        batch_size,
        device,
        epochs=50,
        path="weights/model.bin"):

    # Build model
    print('Building model ...')
    net = LSTM_divider(consts.voc_size)
    net.to(device)
    print('Done!')

    print('Building dataset ...')
    train_dataset = utils.Dataset(train_path, consts.CHAR2IDX)
    val_dataset = utils.Dataset(dev_path, consts.CHAR2IDX)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=utils.make_batch)
    val_data_loader = torch.utils.data.DataLoader(val_dataset,
                                                  batch_size=batch_size,
                                                  shuffle=False,
                                                  collate_fn=utils.make_batch)
    print('Done!')

    # Class for Early Stopping
    es = utils.EarlyStopping()

    for epoch in range(1, epochs + 1):
        print(f'epoch {epoch}')
        train_fn(net, train_data_loader, device)
        precision = valid_fn(net, val_data_loader, device)

        # If the score has not improved for a certain number of epochs,
        # stop training
        if es.update(precision):
            print(f'Score has not improved for {es.max_patient} epochs')
            print(f'Best precision -> {es.best}')
            torch.save(net.state_dict(), path)
            return
Example #8
    def load_model_weight_continue_train(self):
        self.build_and_set_model()
        assert os.path.exists("cv_model.pth")
        self.model_ft.load_state_dict(torch.load("cv_model.pth"))
        self.model_ft.eval()

        patience = 3
        if self.submit_run:
            patience = 0
        es = utils.EarlyStopping(
            patience=patience
        )  # with patience=1, training would stop the first time the metric gets worse
        for epoch in range(8, 13):
            train_loss = self.train_one_epoch(
                self.model_ft,
                self.optimizer,
                self.data_loader,
                self.device,
                epoch,
                self.metric_logger,
                print_freq=100,
            )
            print(f"train_loss (averaged) is {train_loss}")
            self.lr_scheduler.step()  # change learning rate

            if not self.submit_run:
                metric = self.eval_model_loss(
                    self.model_ft,
                    self.data_loader_dev,
                    self.device,
                    self.metric_logger,
                    print_freq=100,
                )
                print(f"metric (averaged) is {metric}")
                if es.step(metric):
                    print(
                        f"{epoch+1} epochs run and early stop, with patience {patience}"
                    )
                    break
Example #9
    def train_model(self):
        patience = 3
        if self.submit_run:
            patience = 0
        es = utils.EarlyStopping(
            patience=patience
        )  # with patience=1, training would stop the first time the metric gets worse

        for epoch in range(self.num_epochs):
            train_loss = self.train_one_epoch(
                self.model_ft,
                self.optimizer,
                self.data_loader,
                self.device,
                epoch,
                self.metric_logger,
                print_freq=10,
                mq_logger=self.logger,
            )
            self.metric_logger.print_and_log_to_file(
                f"train_loss (averaged) is {train_loss}")
            self.lr_scheduler.step()  # change learning rate

            if not self.submit_run:
                metric = self.eval_model_loss(
                    self.model_ft,
                    self.data_loader_dev,
                    self.device,
                    self.metric_logger,
                    print_freq=10,
                )
                self.metric_logger.print_and_log_to_file(
                    f"\nmetric (averaged) is {metric}\n")
                if es.step(metric):
                    self.print_log(
                        f"{epoch+1} epochs run and early stop, with patience {patience}"
                    )
                    break
Example #10
def run(fold):
    """
    Train the model for a specified fold
    """
    # Read training csv
    dfx = pd.read_csv(TRAINING_FILE)
    
    # Set train validation set split
    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)
    print('SIZE', len(df_train), len(df_valid))
    
    # Instantiate TweetDataset with training data
    train_dataset = TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values
    )
    
    # Instantiate DataLoader with `train_dataset`
    # This is a generator that yields the dataset in batches
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=TRAIN_BATCH_SIZE,
        num_workers=4
    )
    
    # Instantiate TweetDataset with validation data
    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values
    )
    
    # Instantiate DataLoader with `valid_dataset`
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=VALID_BATCH_SIZE,
        num_workers=2
    )
    
    # Set device as `cuda` (GPU)
    device = torch.device("cuda")
    # Load pretrained RoBERTa
    model_config = transformers.RobertaConfig.from_pretrained(ROBERTA_PATH)
    # Output hidden states
    # This is important to set since we want to concatenate the hidden states from the last 2 BERT layers
    model_config.output_hidden_states = True
    # Instantiate our model with `model_config`
    model = TweetModel(conf=model_config)
    # Move the model to the GPU
    model.to(device)
    
    # Calculate the number of training steps
    num_train_steps = int(len(df_train) / TRAIN_BATCH_SIZE * EPOCHS)
    # Get the list of named parameters
    param_optimizer = list(model.named_parameters())
    # Specify parameters where weight decay shouldn't be applied
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    # Define two sets of parameters: those with weight decay, and those without
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]
    # Instantiate AdamW optimizer with our two sets of parameters, and a learning rate of 3e-5
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    # Create a scheduler to set the learning rate at each training step
    # "Create a schedule with a learning rate that decreases linearly after linearly increasing during a warmup period." (https://pytorch.org/docs/stable/optim.html)
    # Since num_warmup_steps = 0, the learning rate starts at 3e-5, and then linearly decreases at each training step
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )
    
    # Apply early stopping with patience of 2
    # This means training stops once 2 epochs have passed without any improvement
    es = utils.EarlyStopping(patience=2, mode="max")
    # es = EarlyStopping(patience=2)
    print(f"Training is Starting for fold={fold}")
    
    # I'm training only for 3 epochs even though I specified 5!!!
    for epoch in range(EPOCHS):
        train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
        jaccard = eval_fn(valid_data_loader, model, device)
        print(f"Jaccard Score = {jaccard}")
        es(jaccard, model, model_path=SAVE_HEAD + str(fold) + '.bin')
        if es.early_stop:
            print("Early stopping")
            break
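
Several of the Tweet examples above call es(jaccard, model, model_path=...) and then check es.early_stop, i.e. a callable, score-based stopper with mode="max" that checkpoints the model whenever the validation score improves and sets the early_stop flag after patience epochs without improvement. A minimal sketch of that interface, for illustration only (the real utils.EarlyStopping may differ):

import torch

class EarlyStopping:
    """Illustrative callable early stopper; mode="max" means higher scores are better."""

    def __init__(self, patience=2, mode="max", delta=1e-4):
        self.patience = patience
        self.mode = mode
        self.delta = delta
        self.counter = 0
        self.best_score = None
        self.early_stop = False

    def __call__(self, score, model, model_path):
        # flip the sign in "min" mode so that larger is always better internally
        score = score if self.mode == "max" else -score
        if self.best_score is None or score > self.best_score + self.delta:
            self.best_score = score
            self.counter = 0
            torch.save(model.state_dict(), model_path)  # checkpoint on improvement
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
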
Example #11
def train(config, fold, model_1, model_2, dict_loader, optimizer, scheduler, list_dir_save_model, dir_pyplot, Validation=True, Test_flag = True):

    train_loader = dict_loader['train']
    val_loader = dict_loader['val']
    test_loader = dict_loader['test']

    """ loss """
    # criterion_cls = nn.CrossEntropyLoss()
    # criterion_cls = ut.FocalLoss(gamma=st.focal_gamma, alpha=st.focal_alpha, size_average=True)
    criterion_cls = nn.BCELoss()
    # criterion = nn.L1Loss(reduction='mean').cuda()
    criterion = nn.MSELoss(reduction='mean').cuda()
    # criterion_gdl = gdl_loss(pNorm=2).cuda()

    EMS = ut.eval_metric_storage()
    list_selected_EMS = []
    list_ES = []
    for i_tmp in range(len(st.list_standard_eval_dir)):
        list_selected_EMS.append(ut.eval_selected_metirc_storage())
        list_ES.append(ut.EarlyStopping(delta=0, patience=st.early_stopping_patience, verbose=True))


    print('training')
    """ epoch """
    ut.model_freeze(model_2, requires_grad=False)
    num_data = len(train_loader.dataset)
    for epoch in range(config.num_epochs):
        epoch = epoch + 1  # switch to 1-based epoch numbering
        print(" ")
        print("---------------  epoch {} ----------------".format(epoch))
        torch.cuda.empty_cache()

        """ print learning rate """
        for param_group in optimizer.param_groups:
            print('current LR : {}'.format(param_group['lr']))

        """ batch """
        for i, data_batch in enumerate(train_loader):
            # start = time.time()
            model_1.train()
            model_2.eval()
            EMS.total_train_step += 1

            with torch.no_grad():
                """ input"""
                datas = Variable(data_batch['data'].float()).cuda()
                # labels = Variable(data_batch['label'].long()).cuda()
                labels = Variable(data_batch['label'].float()).cuda()

                """ minmax norm"""
                if st.list_data_norm_type[st.data_norm_type_num] == 'minmax':
                    tmp_datas = datas.view(datas.size(0), -1)
                    tmp_datas -= tmp_datas.min(1, keepdim=True)[0]
                    tmp_datas /= tmp_datas.max(1, keepdim=True)[0]
                    datas = tmp_datas.view_as(datas)

                """ data augmentation """
                ##TODO : flip
                # flip_flag_list = np.random.normal(size=datas.shape[0])>0
                # datas[flip_flag_list] = datas[flip_flag_list].flip(-3)

                ##TODO : translation, cropping
                dict_result = ut.data_augmentation(datas=datas, cur_epoch=epoch)
                datas = dict_result['datas']
                # aug_dict_result = ut.data_augmentation(datas=aug_datas, cur_epoch=epoch)
                # aug_datas = aug_dict_result['datas']

                """ gaussain noise """
                # Gaussian_dist = torch.distributions.normal.Normal(loc=torch.tensor([0.0]), scale=torch.tensor([0.01]))
                # Gaussian_dist = torch.distributions.normal.Normal(loc=torch.tensor([0.0]), scale=torch.FloatTensor(1).uniform_(0, 0.01))
                # Gaussian_noise = Gaussian_dist.sample(datas.size()).squeeze(-1)
                # datas = datas + Gaussian_noise.cuda()

                """ model 1 forward """
                dict_result = model_2(datas)
                output_3 = dict_result['logitMap']

            """ forward propagation """
            dict_result = model_1(output_3.detach())
            output_1 = dict_result['logits']
            output_2 = dict_result['Aux_logits']
            output_3 = dict_result['logitMap']

            """ classification """
            loss_list_1 = []

            loss_2 = criterion_cls(output_1, labels)
            loss_list_1.append(loss_2)
            EMS.train_aux_loss_1.append(loss_2.data.cpu().numpy())
            loss = sum(loss_list_1)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            """ print the train loss and tensorboard"""
            if EMS.total_train_step % 10 == 0:
                # print('time : ', time.time() - start)
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f'
                      %(epoch, config.num_epochs, i + 1, (round(num_data / config.batch_size)), loss.data.cpu().numpy()))

            torch.cuda.empty_cache()
            """ pyplot """
            EMS.train_loss.append(loss.data.cpu().numpy())
            EMS.train_step.append(EMS.total_train_step)


        """ val """
        if Validation == True:
            print("------------------  val  --------------------------")
            dict_result = ut.eval_classification_model_2(config, fold, val_loader, model_1, model_2, criterion_cls)
            val_loss = dict_result['Loss']
            acc = dict_result['Acc']
            auc = dict_result['AUC']
            print('Fold : %d, Epoch [%d/%d] val Loss = %f val Acc = %f' % (fold, epoch, config.num_epochs, val_loss, acc))
            torch.cuda.empty_cache()

            """ save the metric """
            EMS.dict_val_metric['val_loss'].append(val_loss)
            EMS.dict_val_metric['val_acc'].append(acc)
            EMS.dict_val_metric['val_auc'].append(auc)
            EMS.val_step.append(EMS.total_train_step)

            """ save model """
            for i_tmp in range(len(list_selected_EMS)):
                save_flag = ut.model_save_through_validation(fold, epoch, EMS=EMS,
                                                             selected_EMS=list_selected_EMS[i_tmp],
                                                             ES=list_ES[i_tmp],
                                                             model=model_1,
                                                             dir_save_model=list_dir_save_model[i_tmp],
                                                             metric_1=st.list_standard_eval[i_tmp], metric_2='',
                                                             save_flag=False)



        if Test_flag == True:
            print("------------------  test _ test dataset  --------------------------")
            """ load data """
            dict_result = ut.eval_classification_model_2(config, fold, test_loader, model_1, model_2, criterion_cls)
            test_loss = dict_result['Loss']
            acc = dict_result['Acc']

            """ pyplot """
            EMS.test_acc.append(acc)
            EMS.test_loss.append(test_loss)
            EMS.test_step.append(EMS.total_train_step)

            print('number of test samples : {}'.format(len(test_loader.dataset)))
            print('Fold : %d, Epoch [%d/%d] test Loss = %f test Acc = %f' % (fold, epoch, config.num_epochs, test_loss, acc))
            torch.cuda.empty_cache()

        """ learning rate decay"""
        EMS.LR.append(optimizer.param_groups[0]['lr'])
        scheduler.step()
        # scheduler.step(val_loss)

        """ plot the chat """
        if epoch % 10 == 0:
            ut.plot_training_info_1(fold, dir_pyplot, EMS,  flag='percentile', flag_match=False)

        ##TODO : early stop only once all of the metrics have stopped improving
        tmp_count = 0
        for i in range(len(list_ES)):
            if list_ES[i].early_stop == True:
                tmp_count += 1
        if tmp_count == len(list_ES):
            break

    """ release the model """
    del model_1, EMS
    torch.cuda.empty_cache()
Example #12
def train(model, train_data, val_data, args):
    model.train()

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    criterion = nn.BCEWithLogitsLoss()

    num_iterations_per_epoch = len(train_data) / args.batch_size
    val_eval_freq = int(args.val_evaluation_freq * num_iterations_per_epoch)
    print(
        f"Val set evaluated every {val_eval_freq:,} steps (approx. {args.val_evaluation_freq} epoch)"
    )

    es = utils.EarlyStopping(args.early_stopping_patience)
    initial_time = time.time()

    train_dataloader = DataLoader(train_data,
                                  batch_size=args.batch_size,
                                  shuffle=True)

    global_step = 0
    epoch_no = 0
    while True:
        print(f"EPOCH #{epoch_no+1}")

        # Train single epoch
        for batch in train_dataloader:
            headlines, headline_lengths, bodys, para_lengths, labels = tuple(
                b.to(device) for b in batch)
            optimizer.zero_grad()

            preds = model(headlines, headline_lengths, bodys, para_lengths)
            loss = criterion(preds, labels)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()

            global_step += 1
            print("globstep:", global_step)
            if global_step % val_eval_freq == 0:
                # Evaluate on validation set
                val_loss, val_acc, val_auc = evaluate(model, val_data)
                model.train()

                end_time = time.time()
                minutes_elapsed = int((end_time - initial_time) / 60)
                print(
                    "STEP: {:7} | TIME: {:4}min | VAL LOSS: {:.4f} | VAL ACC: {:.4f} | VAL AUROC: {:.4f}"
                    .format(global_step, minutes_elapsed, val_loss, val_acc,
                            val_auc))

                # Check early stopping
                if global_step >= args.min_iterations:
                    es.record_loss(val_loss, model)

                if es.should_stop():
                    print(f"Early stopping at STEP: {global_step}...")
                    return

            if global_step == args.max_iterations:
                print(
                    f"Stopping after reaching max iterations({global_step})..."
                )
                return
        epoch_no += 1
Example #13
def train(config,
          fold,
          model,
          dict_loader,
          optimizer,
          scheduler,
          list_dir_save_model,
          dir_pyplot,
          Validation=True,
          Test_flag=True):

    train_loader = dict_loader['train']
    val_loader = dict_loader['val']
    test_loader = dict_loader['test']
    """ loss """
    # criterion_cls = nn.CrossEntropyLoss()
    # criterion_cls = ut.FocalLoss(gamma=st.focal_gamma, alpha=st.focal_alpha, size_average=True)
    # kdloss = ut.KDLoss(4.0)
    criterion_KL = nn.KLDivLoss(reduction="sum")
    criterion_cls = nn.BCELoss()
    # criterion_L1 = nn.L1Loss(reduction='sum').cuda()
    # criterion_L2 = nn.MSELoss(reduction='mean').cuda()
    # criterion_gdl = gdl_loss(pNorm=2).cuda()

    EMS = ut.eval_metric_storage()
    list_selected_EMS = []
    list_ES = []
    for i_tmp in range(len(st.list_standard_eval_dir)):
        list_selected_EMS.append(ut.eval_selected_metirc_storage())
        list_ES.append(
            ut.EarlyStopping(delta=0,
                             patience=st.early_stopping_patience,
                             verbose=True))

    loss_tmp = [0] * 5
    loss_tmp_total = 0
    print('training')
    optimizer.zero_grad()
    optimizer.step()
    """ epoch """
    num_data = len(train_loader.dataset)
    for epoch in range(1, config.num_epochs + 1):
        scheduler.step()
        print(" ")
        print("---------------  epoch {} ----------------".format(epoch))
        """ print learning rate """
        for param_group in optimizer.param_groups:
            print('current LR : {}'.format(param_group['lr']))
        """ batch """
        for i, data_batch in enumerate(train_loader):
            # start = time.time()
            model.train()
            with torch.no_grad():
                """ input"""
                datas = Variable(data_batch['data'].float()).cuda()
                # labels = Variable(data_batch['label'].long()).cuda()
                labels = Variable(data_batch['label'].float()).cuda()
                """ data augmentation """
                ##TODO : flip
                # flip_flag_list = np.random.normal(size=datas.shape[0])>0
                # datas[flip_flag_list] = datas[flip_flag_list].flip(-3)

                ##TODO : translation, cropping
                dict_result = ut.data_augmentation(datas=datas,
                                                   cur_epoch=epoch)
                datas = dict_result['datas']
                translation_list = dict_result['translation_list']
                # aug_dict_result = ut.data_augmentation(datas=aug_datas, cur_epoch=epoch)
                # aug_datas = aug_dict_result['datas']
                """ minmax norm"""
                if st.list_data_norm_type[st.data_norm_type_num] == 'minmax':
                    tmp_datas = datas.view(datas.size(0), -1)
                    tmp_datas -= tmp_datas.min(1, keepdim=True)[0]
                    tmp_datas /= tmp_datas.max(1, keepdim=True)[0]
                    datas = tmp_datas.view_as(datas)
                """ gaussain noise """
                # Gaussian_dist = torch.distributions.normal.Normal(loc=torch.tensor([0.0]), scale=torch.tensor([0.01]))
                # Gaussian_dist = torch.distributions.normal.Normal(loc=torch.tensor([0.0]), scale=torch.FloatTensor(1).uniform_(0, 0.01))
                # Gaussian_noise = Gaussian_dist.sample(datas.size()).squeeze(-1)
                # datas = datas + Gaussian_noise.cuda()
            """ forward propagation """
            dict_result = model(datas, translation_list)
            output_1 = dict_result['logits']
            output_2 = dict_result['Aux_logits']
            output_3 = dict_result['logitMap']
            output_4 = dict_result['l1_norm']

            #
            loss_list_1 = []
            count_loss = 0
            if fst.flag_loss_1 == True:
                s_labels = ut.smooth_one_hot(labels,
                                             config.num_classes,
                                             smoothing=st.smoothing_img)
                loss_2 = criterion_cls(
                    output_1,
                    s_labels) * st.lambda_major[0] / st.iter_to_update
                loss_list_1.append(loss_2)
                loss_tmp[count_loss] += loss_2.data.cpu().numpy()
                if (EMS.total_train_iter + 1) % st.iter_to_update == 0:
                    EMS.train_aux_loss[count_loss].append(loss_tmp[count_loss])
                    loss_tmp[count_loss] = 0
                count_loss += 1

            if fst.flag_loss_2 == True:
                for i_tmp in range(len(output_2)):
                    s_labels = ut.smooth_one_hot(labels,
                                                 config.num_classes,
                                                 smoothing=st.smoothing_roi)
                    loss_2 = criterion_cls(
                        output_2[i_tmp],
                        s_labels) * st.lambda_aux[i_tmp] / st.iter_to_update
                    loss_list_1.append(loss_2)

                    loss_tmp[count_loss] += loss_2.data.cpu().numpy()
                    if (EMS.total_train_iter + 1) % st.iter_to_update == 0:
                        EMS.train_aux_loss[count_loss].append(
                            loss_tmp[count_loss])
                        loss_tmp[count_loss] = 0
                    count_loss += 1

            if fst.flag_loss_3 == True:
                # patch

                list_loss_tmp = []
                for tmp_j in range(len(output_4)):  # type i.e., patch, roi
                    loss_2 = 0
                    for tmp_i in range(len(output_4[tmp_j])):  # batch
                        tmp_shape = output_4[tmp_j][tmp_i].shape
                        logits = output_4[tmp_j][tmp_i].view(
                            tmp_shape[0], tmp_shape[1], -1)
                        # loss_2 += torch.norm(logits, p=1)
                        loss_2 += torch.norm(logits,
                                             p=1) / (logits.view(-1).size(0))
                    list_loss_tmp.append(
                        (loss_2 / len(output_4[tmp_j]) * st.l1_reg_norm) /
                        st.iter_to_update)
                loss_list_1.append(sum(list_loss_tmp))

                loss_tmp[count_loss] += sum(list_loss_tmp).data.cpu().numpy()
                if (EMS.total_train_iter + 1) % st.iter_to_update == 0:
                    EMS.train_aux_loss[count_loss].append(loss_tmp[count_loss])
                    loss_tmp[count_loss] = 0
                count_loss += 1
            """ L1 reg"""
            # norm = torch.FloatTensor([0]).cuda()
            # for parameter in model.parameters():
            #     norm += torch.norm(parameter, p=1)
            # loss_list_1.append(norm * st.l1_reg)

            loss = sum(loss_list_1)
            loss.backward()
            torch.cuda.empty_cache()
            loss_tmp_total += loss.data.cpu().numpy()

            #TODO :  optimize the model param
            if (EMS.total_train_iter + 1) % st.iter_to_update == 0:
                optimizer.step()
                optimizer.zero_grad()
                """ pyplot """
                EMS.total_train_step += 1
                EMS.train_step.append(EMS.total_train_step)
                EMS.train_loss.append(loss_tmp_total)
                """ print the train loss and tensorboard"""
                if (EMS.total_train_step) % 10 == 0:
                    # print('time : ', time.time() - start)
                    print('Epoch [%d/%d], Step [%d/%d],  Loss: %.4f' %
                          (epoch, config.num_epochs, (i + 1),
                           (num_data // (config.batch_size)), loss_tmp_total))
                loss_tmp_total = 0

            EMS.total_train_iter += 1
            # scheduler.step(epoch + i / len(train_loader))
        """ val """
        if Validation == True:
            print("------------------  val  --------------------------")
            if fst.flag_cropping == True and fst.flag_eval_cropping == True:
                dict_result = ut.eval_classification_model_cropped_input(
                    config, fold, val_loader, model, criterion_cls)
            elif fst.flag_translation == True and fst.flag_eval_translation == True:
                dict_result = ut.eval_classification_model_esemble(
                    config, fold, val_loader, model, criterion_cls)
            elif fst.flag_MC_dropout == True:
                dict_result = ut.eval_classification_model_MC_dropout(
                    config, fold, val_loader, model, criterion_cls)
            else:
                dict_result = ut.eval_classification_model(
                    config, fold, val_loader, model, criterion_cls)
            val_loss = dict_result['Loss']
            acc = dict_result['Acc']
            auc = dict_result['AUC']

            print('Fold : %d, Epoch [%d/%d] val Loss = %f val Acc = %f' %
                  (fold, epoch, config.num_epochs, val_loss, acc))
            """ save the metric """
            EMS.dict_val_metric['val_loss'].append(val_loss)
            EMS.dict_val_metric['val_acc'].append(acc)
            if fst.flag_loss_2 == True:
                for tmp_i in range(len(st.lambda_aux)):
                    EMS.dict_val_metric['val_acc_aux'][tmp_i].append(
                        dict_result['Acc_aux'][tmp_i])
            EMS.dict_val_metric['val_auc'].append(auc)
            EMS.val_step.append(EMS.total_train_step)

            n_stacking_loss_for_selection = 5
            if len(EMS.dict_val_metric['val_loss_queue']
                   ) > n_stacking_loss_for_selection:
                EMS.dict_val_metric['val_loss_queue'].popleft()
            EMS.dict_val_metric['val_loss_queue'].append(val_loss)
            EMS.dict_val_metric['val_mean_loss'].append(
                np.mean(EMS.dict_val_metric['val_loss_queue']))
            """ save model """
            for i_tmp in range(len(list_selected_EMS)):
                save_flag = ut.model_save_through_validation(
                    fold,
                    epoch,
                    EMS=EMS,
                    selected_EMS=list_selected_EMS[i_tmp],
                    ES=list_ES[i_tmp],
                    model=model,
                    dir_save_model=list_dir_save_model[i_tmp],
                    metric_1=st.list_standard_eval[i_tmp],
                    metric_2='',
                    save_flag=False)

        if Test_flag == True:
            print(
                "------------------  test _ test dataset  --------------------------"
            )
            """ load data """
            if fst.flag_cropping == True and fst.flag_eval_cropping == True:
                print("eval : cropping")
                dict_result = ut.eval_classification_model_cropped_input(
                    config, fold, test_loader, model, criterion_cls)
            elif fst.flag_translation == True and fst.flag_eval_translation == True:
                print("eval : assemble")
                dict_result = ut.eval_classification_model_esemble(
                    config, fold, test_loader, model, criterion_cls)
            elif fst.flag_MC_dropout == True:
                dict_result = ut.eval_classification_model_MC_dropout(
                    config, fold, test_loader, model, criterion_cls)
            else:
                print("eval : whole image")
                dict_result = ut.eval_classification_model(
                    config, fold, test_loader, model, criterion_cls)
            acc = dict_result['Acc']
            test_loss = dict_result['Loss']
            """ pyplot """
            EMS.test_acc.append(acc)
            if fst.flag_loss_2 == True:
                for tmp_i in range(len(st.lambda_aux)):
                    EMS.test_acc_aux[tmp_i].append(
                        dict_result['Acc_aux'][tmp_i])
            EMS.test_loss.append(test_loss)
            EMS.test_step.append(EMS.total_train_step)

            print('number of test samples : {}'.format(len(
                test_loader.dataset)))
            print('Fold : %d, Epoch [%d/%d] test Loss = %f test Acc = %f' %
                  (fold, epoch, config.num_epochs, test_loss, acc))
        """ learning rate decay"""
        EMS.LR.append(optimizer.param_groups[0]['lr'])
        # scheduler.step()
        # scheduler.step(val_loss)
        """ plot the chat """
        if epoch % 1 == 0:
            ut.plot_training_info_1(fold,
                                    dir_pyplot,
                                    EMS,
                                    flag='percentile',
                                    flag_match=False)

        ##TODO : early stop only once all of the metrics have stopped improving
        tmp_count = 0
        for i in range(len(list_ES)):
            if list_ES[i].early_stop == True:
                tmp_count += 1
        if tmp_count == len(list_ES):
            break
    """ release the model """
    del model, EMS
    torch.cuda.empty_cache()
Example #14
def run(fold):
    dfx = pd.read_csv(config.TRAINING_FILE)

    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    train_dataset = TweetDataset(tweet=df_train.text.values,
                                 sentiment=df_train.sentiment.values,
                                 selected_text=df_train.selected_text.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    valid_dataset = TweetDataset(tweet=df_valid.text.values,
                                 sentiment=df_valid.sentiment.values,
                                 selected_text=df_valid.selected_text.values)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=2)

    device = torch.device("cuda")
    model_config = transformers.RobertaConfig.from_pretrained(
        config.ROBERTA_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.001
        },
        {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        },
    ]
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    es = utils.EarlyStopping(patience=2, mode="max")
    print(f"Training is Starting for fold={fold}")

    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader,
                        model,
                        optimizer,
                        device,
                        scheduler=scheduler)
        jaccard = engine.eval_fn(valid_data_loader, model, device)
        #print(f"Jaccard Score = {jaccard}")
        es(jaccard, model, model_path=f"model_{fold}.bin")
        if es.early_stop:
            print("Early stopping")
            break
Example #15
def train():
    # Check NNabla version
    if utils.get_nnabla_version_integer() < 11900:
        raise ValueError(
            'Please update nnabla to v1.19.0 or later; memory efficiency of the core engine was improved in v1.19.0'
        )

    parser, args = get_train_args()

    # Get context.
    ctx = get_extension_context(args.context, device_id=args.device_id)
    comm = CommunicatorWrapper(ctx)
    nn.set_default_context(comm.ctx)
    ext = import_extension_module(args.context)

    # Monitors
    # setting up monitors for logging
    monitor_path = args.output
    monitor = Monitor(monitor_path)

    monitor_best_epoch = MonitorSeries('Best epoch', monitor, interval=1)
    monitor_training_loss = MonitorSeries('Training loss', monitor, interval=1)
    monitor_validation_loss = MonitorSeries('Validation loss',
                                            monitor,
                                            interval=1)
    monitor_lr = MonitorSeries('learning rate', monitor, interval=1)
    monitor_time = MonitorTimeElapsed("training time per iteration",
                                      monitor,
                                      interval=1)

    if comm.rank == 0:
        print("Mixing coef. is {}, i.e., MDL = {}*TD-Loss + FD-Loss".format(
            args.mcoef, args.mcoef))
        if not os.path.isdir(args.output):
            os.makedirs(args.output)

    # Initialize DataIterator for MUSDB.
    train_source, valid_source, args = load_datasources(parser, args)

    train_iter = data_iterator(train_source,
                               args.batch_size,
                               RandomState(args.seed),
                               with_memory_cache=False,
                               with_file_cache=False)

    valid_iter = data_iterator(valid_source,
                               1,
                               RandomState(args.seed),
                               with_memory_cache=False,
                               with_file_cache=False)

    if comm.n_procs > 1:
        train_iter = train_iter.slice(rng=None,
                                      num_of_slices=comm.n_procs,
                                      slice_pos=comm.rank)

        valid_iter = valid_iter.slice(rng=None,
                                      num_of_slices=comm.n_procs,
                                      slice_pos=comm.rank)

    # Calculate maxiter per GPU device.
    max_iter = int((train_source._size // args.batch_size) // comm.n_procs)
    weight_decay = args.weight_decay * comm.n_procs

    print("max_iter", max_iter)

    # Calculate the statistics (mean and variance) of the dataset
    scaler_mean, scaler_std = utils.get_statistics(args, train_source)

    max_bin = utils.bandwidth_to_max_bin(train_source.sample_rate, args.nfft,
                                         args.bandwidth)

    unmix = OpenUnmix_CrossNet(input_mean=scaler_mean,
                               input_scale=scaler_std,
                               nb_channels=args.nb_channels,
                               hidden_size=args.hidden_size,
                               n_fft=args.nfft,
                               n_hop=args.nhop,
                               max_bin=max_bin)

    # Create input variables.
    mixture_audio = nn.Variable([args.batch_size] +
                                list(train_source._get_data(0)[0].shape))
    target_audio = nn.Variable([args.batch_size] +
                               list(train_source._get_data(0)[1].shape))

    vmixture_audio = nn.Variable(
        [1] + [2, valid_source.sample_rate * args.valid_dur])
    vtarget_audio = nn.Variable([1] +
                                [8, valid_source.sample_rate * args.valid_dur])

    # create training graph
    mix_spec, M_hat, pred = unmix(mixture_audio)
    Y = Spectrogram(*STFT(target_audio, n_fft=unmix.n_fft, n_hop=unmix.n_hop),
                    mono=(unmix.nb_channels == 1))
    loss_f = mse_loss(mix_spec, M_hat, Y)
    loss_t = sdr_loss(mixture_audio, pred, target_audio)
    loss = args.mcoef * loss_t + loss_f
    loss.persistent = True

    # Create Solver and set parameters.
    solver = S.Adam(args.lr)
    solver.set_parameters(nn.get_parameters())

    # create validation graph
    vmix_spec, vM_hat, vpred = unmix(vmixture_audio, test=True)
    vY = Spectrogram(*STFT(vtarget_audio, n_fft=unmix.n_fft,
                           n_hop=unmix.n_hop),
                     mono=(unmix.nb_channels == 1))
    vloss_f = mse_loss(vmix_spec, vM_hat, vY)
    vloss_t = sdr_loss(vmixture_audio, vpred, vtarget_audio)
    vloss = args.mcoef * vloss_t + vloss_f
    vloss.persistent = True

    # Initialize Early Stopping
    es = utils.EarlyStopping(patience=args.patience)

    # Initialize LR Scheduler (ReduceLROnPlateau)
    lr_scheduler = ReduceLROnPlateau(lr=args.lr,
                                     factor=args.lr_decay_gamma,
                                     patience=args.lr_decay_patience)
    best_epoch = 0

    # Training loop.
    for epoch in trange(args.epochs):
        # TRAINING
        losses = utils.AverageMeter()
        for batch in range(max_iter):
            mixture_audio.d, target_audio.d = train_iter.next()
            solver.zero_grad()
            loss.forward(clear_no_need_grad=True)
            if comm.n_procs > 1:
                all_reduce_callback = comm.get_all_reduce_callback()
                loss.backward(clear_buffer=True,
                              communicator_callbacks=all_reduce_callback)
            else:
                loss.backward(clear_buffer=True)
            solver.weight_decay(weight_decay)
            solver.update()
            losses.update(loss.d.copy(), args.batch_size)
        training_loss = losses.avg

        # clear cache memory
        ext.clear_memory_cache()

        # VALIDATION
        vlosses = utils.AverageMeter()
        for batch in range(int(valid_source._size // comm.n_procs)):
            x, y = valid_iter.next()
            dur = int(valid_source.sample_rate * args.valid_dur)
            sp, cnt = 0, 0
            loss_tmp = nn.NdArray()
            loss_tmp.zero()
            while 1:
                vmixture_audio.d = x[Ellipsis, sp:sp + dur]
                vtarget_audio.d = y[Ellipsis, sp:sp + dur]
                vloss.forward(clear_no_need_grad=True)
                cnt += 1
                sp += dur
                loss_tmp += vloss.data
                if x[Ellipsis,
                     sp:sp + dur].shape[-1] < dur or x.shape[-1] == cnt * dur:
                    break
            loss_tmp = loss_tmp / cnt
            if comm.n_procs > 1:
                comm.all_reduce(loss_tmp, division=True, inplace=True)
            vlosses.update(loss_tmp.data.copy(), 1)
        validation_loss = vlosses.avg

        # clear cache memory
        ext.clear_memory_cache()

        lr = lr_scheduler.update_lr(validation_loss, epoch=epoch)
        solver.set_learning_rate(lr)
        stop = es.step(validation_loss)

        if comm.rank == 0:
            monitor_best_epoch.add(epoch, best_epoch)
            monitor_training_loss.add(epoch, training_loss)
            monitor_validation_loss.add(epoch, validation_loss)
            monitor_lr.add(epoch, lr)
            monitor_time.add(epoch)

            if validation_loss == es.best:
                # save best model
                nn.save_parameters(os.path.join(args.output, 'best_xumx.h5'))
                best_epoch = epoch

        if stop:
            print("Apply Early Stopping")
            break
Example #16
def train():
    parser, args = get_args()

    # Get context.
    ctx = get_extension_context(args.context, device_id=args.device_id)
    nn.set_default_context(ctx)

    # Initialize DataIterator for MNIST.
    train_source, valid_source, args = data.load_datasources(
        parser, args, rng=RandomState(42))

    train_iter = data_iterator(train_source,
                               args.batch_size,
                               RandomState(args.seed),
                               with_memory_cache=False,
                               with_file_cache=False)

    valid_iter = data_iterator(valid_source,
                               args.batch_size,
                               RandomState(args.seed),
                               with_memory_cache=False,
                               with_file_cache=False)

    scaler_mean, scaler_std = get_statistics(args, train_source)

    max_bin = utils.bandwidth_to_max_bin(train_source.sample_rate, args.nfft,
                                         args.bandwidth)

    unmix = model.OpenUnmix(input_mean=scaler_mean,
                            input_scale=scaler_std,
                            nb_channels=args.nb_channels,
                            hidden_size=args.hidden_size,
                            n_fft=args.nfft,
                            n_hop=args.nhop,
                            max_bin=max_bin,
                            sample_rate=train_source.sample_rate)

    # Create input variables.
    audio_shape = [args.batch_size] + list(train_source._get_data(0)[0].shape)
    mixture_audio = nn.Variable(audio_shape)
    target_audio = nn.Variable(audio_shape)

    vmixture_audio = nn.Variable(audio_shape)
    vtarget_audio = nn.Variable(audio_shape)

    # create train graph
    pred_spec = unmix(mixture_audio, test=False)
    pred_spec.persistent = True

    target_spec = model.Spectrogram(*model.STFT(target_audio,
                                                n_fft=unmix.n_fft,
                                                n_hop=unmix.n_hop),
                                    mono=(unmix.nb_channels == 1))

    loss = F.mean(F.squared_error(pred_spec, target_spec), axis=1)

    # Create Solver.
    solver = S.Adam(args.lr)
    solver.set_parameters(nn.get_parameters())

    # Training loop.
    t = tqdm.trange(1, args.epochs + 1, disable=args.quiet)
    es = utils.EarlyStopping(patience=args.patience)

    for epoch in t:
        # TRAINING
        t.set_description("Training Epoch")
        b = tqdm.trange(0,
                        train_source._size // args.batch_size,
                        disable=args.quiet)
        losses = utils.AverageMeter()
        for batch in b:
            mixture_audio.d, target_audio.d = train_iter.next()
            b.set_description("Training Batch")
            solver.zero_grad()
            loss.forward(clear_no_need_grad=True)
            loss.backward(clear_buffer=True)
            solver.weight_decay(args.weight_decay)
            solver.update()
            losses.update(loss.d.copy().mean())
            b.set_postfix(train_loss=losses.avg)

        # VALIDATION
        vlosses = utils.AverageMeter()
        for batch in range(valid_source._size):
            # Create new validation input variables for every batch
            vmixture_audio.d, vtarget_audio.d = valid_iter.next()
            # create validation graph
            vpred_spec = unmix(vmixture_audio, test=True)
            vpred_spec.persistent = True

            vtarget_spec = model.Spectrogram(*model.STFT(vtarget_audio,
                                                         n_fft=unmix.n_fft,
                                                         n_hop=unmix.n_hop),
                                             mono=(unmix.nb_channels == 1))
            vloss = F.mean(F.squared_error(vpred_spec, vtarget_spec), axis=1)

            vloss.forward(clear_buffer=True)
            vlosses.update(vloss.d.copy().mean())

        t.set_postfix(train_loss=losses.avg, val_loss=vlosses.avg)

        stop = es.step(vlosses.avg)
        is_best = vlosses.avg == es.best

        # save current model
        nn.save_parameters(
            os.path.join(args.output, 'checkpoint_%s.h5' % args.target))

        if is_best:
            best_epoch = epoch
            nn.save_parameters(os.path.join(args.output,
                                            '%s.h5' % args.target))

        if stop:
            print("Apply Early Stopping")
            break
Example #17
def retrain(args):
    # load dataset
    g_homo, g_list, pairs, labels, train_mask, val_mask, test_mask = u.load_data(
        args['name'], args['train_size'])

    # transfer
    pairs = t.from_numpy(pairs).to(args['device'])
    labels = t.from_numpy(labels).to(args['device'])
    train_mask = t.from_numpy(train_mask).to(args['device'])
    val_mask = t.from_numpy(val_mask).to(args['device'])
    test_mask = t.from_numpy(test_mask).to(args['device'])
    feat1 = t.randn(g_homo.number_of_nodes(),
                    args['in_feats']).to(args['device'])
    feat2 = t.randn(g_list[0].number_of_nodes(),
                    args['in_feats']).to(args['device'])
    labels = labels.view(-1, 1).to(dtype=t.float32)

    # model
    if args['model'] == 'SRG':
        model = m.SRG(rgcn_in_feats=args['in_feats'],
                      rgcn_out_feats=args['embedding_size'],
                      rgcn_num_blocks=args['num_b'],
                      rgcn_dropout=0.,
                      han_num_meta_path=args['num_meta_path'],
                      han_in_feats=args['in_feats'],
                      han_hidden_feats=args['embedding_size'],
                      han_head_list=args['head_list'],
                      han_dropout=args['drop_out'],
                      fc_hidden_feats=args['fc_units']
                      ).to(args['device'])
    elif args['model'] == 'SRG_GAT':
        model = m.SRG_GAT(rgcn_in_feats=args['in_feats'],
                          rgcn_out_feats=args['embedding_size'],
                          rgcn_num_blocks=args['num_b'],
                          rgcn_dropout=args['drop_out'],
                          han_num_meta_path=args['num_meta_path'],
                          han_in_feats=args['in_feats'],
                          han_hidden_feats=args['embedding_size'],
                          han_head_list=args['head_list'],
                          han_dropout=args['drop_out'],
                          fc_hidden_feats=args['fc_units']
                          ).to(args['device'])
    elif args['model'] == 'SRG_no_GRU':
        model = m.SRG_no_GRU(gcn_in_feats=args['in_feats'],
                             gcn_out_feats=args['embedding_size'],
                             gcn_num_layers=args['num_l'],
                             han_num_meta_path=args['num_meta_path'],
                             han_in_feats=args['in_feats'],
                             han_hidden_feats=args['embedding_size'],
                             han_head_list=args['head_list'],
                             han_dropout=args['drop_out'],
                             fc_hidden_feats=args['fc_units']
                             ).to(args['device'])
    elif args['model'] == 'SRG_Res':
        model = m.SRG_Res(gcn_in_feats=args['in_feats'],
                          gcn_out_feats=args['embedding_size'],
                          gcn_num_layers=args['num_l'],
                          han_num_meta_path=args['num_meta_path'],
                          han_in_feats=args['in_feats'],
                          han_hidden_feats=args['embedding_size'],
                          han_head_list=args['head_list'],
                          han_dropout=args['drop_out'],
                          fc_hidden_feats=args['fc_units']
                          ).to(args['device'])
    elif args['model'] == 'SRG_no_GCN':
        model = m.SRG_no_GCN(han_num_meta_path=args['num_meta_path'],
                             han_in_feats=args['in_feats'],
                             han_hidden_feats=args['embedding_size'],
                             han_head_list=args['head_list'],
                             han_dropout=args['drop_out'],
                             fc_hidden_feats=args['fc_units']
                             ).to(args['device'])
    else:
        raise ValueError('wrong name of the model')

    model.load_state_dict(t.load(args['model_path']))

    # log
    log = []
    mae, rmse = u.evaluate(model, g_homo, feat1, g_list,
                           feat2, pairs, labels, val_mask)
    early_stop = u.EarlyStopping(
        args['model_path'], patience=args['patience'], rmse=rmse, mae=mae)

    # loss, optimizer
    loss_func = t.nn.MSELoss()
    optimizer = t.optim.Adam(
        model.parameters(), lr=args['lr'], weight_decay=args['decay'])

    # train
    for epoch in range(args['epochs']):
        dt = datetime.now()

        model.train()
        y_pred = model(g_homo, feat1, g_list, feat2, pairs)
        loss = loss_func(y_pred[train_mask], labels[train_mask])
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        train_mae, train_rmse = u.metrics(
            y_pred[train_mask].detach(), labels[train_mask])
        val_mae, val_rmse = u.evaluate(
            model, g_homo, feat1, g_list, feat2, pairs, labels, val_mask)
        stop = early_stop.step(val_rmse, val_mae, model)

        elapse = str(datetime.now() - dt)[:10] + '\n'
        log.append(' '.join(str(x) for x in (epoch, train_mae,
                                             train_rmse, val_mae, val_rmse, elapse)))
        print(f'epoch={epoch} | train_MAE={train_mae} | train_RMSE={train_rmse} | val_MAE={val_mae} | val_RMSE={val_rmse} | elapse={elapse}')

        if stop:
            break

    early_stop.load_checkpoint(model)
    test_mae, test_rmse = u.evaluate(
        model, g_homo, feat1, g_list, feat2, pairs, labels, test_mask)
    print(f'test_MAE={test_mae} | test_RMSE={test_rmse}')

    # save log
    with open(args['log_path'], 'a') as f:
        f.writelines(log)
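
In this variant the early-stopping helper also acts as a checkpointer: it is seeded with the pre-training validation RMSE/MAE, `step()` saves the model whenever RMSE improves, and `load_checkpoint()` restores the best weights before the final test evaluation. A hypothetical sketch compatible with those calls (the attribute names and the exact improvement rule are assumptions, not the project's `u.EarlyStopping`):

import torch as t

class EarlyStopping:
    """Save the best model (by validation RMSE) and stop after `patience` bad epochs."""

    def __init__(self, model_path, patience=10, rmse=float('inf'), mae=float('inf')):
        self.model_path = model_path
        self.patience = patience
        self.best_rmse = rmse
        self.best_mae = mae
        self.counter = 0

    def step(self, rmse, mae, model):
        if rmse < self.best_rmse:
            self.best_rmse, self.best_mae = rmse, mae
            self.counter = 0
            t.save(model.state_dict(), self.model_path)   # checkpoint the new best model
        else:
            self.counter += 1
        return self.counter >= self.patience               # True -> stop training

    def load_checkpoint(self, model):
        model.load_state_dict(t.load(self.model_path))
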
Example #18
def main():
    parser = argparse.ArgumentParser(description='Open Unmix Trainer')
    # Loss parameters
    parser.add_argument('--loss',
                        type=str,
                        default="L2freq",
                        choices=[
                            'L2freq', 'L1freq', 'L2time', 'L1time', 'L2mask',
                            'L1mask', 'SISDRtime', 'SISDRfreq', 'MinSNRsdsdr',
                            'CrossEntropy', 'BinaryCrossEntropy', 'LogL2time',
                            'LogL1time', 'LogL2freq', 'LogL1freq', 'PSA',
                            'SNRPSA', 'Dissimilarity'
                        ],
                        help='kind of loss used during training')

    # Dataset parameters
    parser.add_argument('--dataset',
                        type=str,
                        default="musdb",
                        choices=[
                            'musdb', 'aligned', 'sourcefolder',
                            'trackfolder_var', 'trackfolder_fix'
                        ],
                        help='Name of the dataset.')

    parser.add_argument('--root', type=str, help='root path of dataset')
    parser.add_argument('--output',
                        type=str,
                        default="open-unmix",
                        help='provide output path base folder name')
    parser.add_argument('--model', type=str, help='Path to checkpoint folder')

    # Training Parameters
    parser.add_argument('--epochs', type=int, default=1000)
    parser.add_argument('--reduce-samples',
                        type=int,
                        default=1,
                        help="reduce training samples by factor n")

    parser.add_argument('--batch-size', type=int, default=16)
    parser.add_argument('--lr',
                        type=float,
                        default=0.001,
                        help='learning rate, defaults to 1e-3')
    parser.add_argument(
        '--patience',
        type=int,
        default=140,
        help='maximum number of epochs to train (default: 140)')
    parser.add_argument('--lr-decay-patience',
                        type=int,
                        default=80,
                        help='lr decay patience for plateau scheduler')
    parser.add_argument('--lr-decay-gamma',
                        type=float,
                        default=0.3,
                        help='gamma of learning rate scheduler decay')
    parser.add_argument('--weight-decay',
                        type=float,
                        default=0.00001,
                        help='weight decay')
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        metavar='S',
                        help='random seed (default: 42)')

    # Model Parameters
    parser.add_argument('--seq-dur',
                        type=float,
                        default=6.0,
                        help='Sequence duration in seconds; '
                        'a value of <=0.0 will use full/variable length')
    parser.add_argument(
        '--unidirectional',
        action='store_true',
        default=False,
        help='Use unidirectional LSTM instead of bidirectional')
    parser.add_argument('--nfft',
                        type=int,
                        default=4096,
                        help='STFT fft size and window size')
    parser.add_argument('--nhop', type=int, default=1024, help='STFT hop size')
    parser.add_argument(
        '--hidden-size',
        type=int,
        default=512,
        help='hidden size parameter of dense bottleneck layers')
    parser.add_argument('--bandwidth',
                        type=int,
                        default=16000,
                        help='maximum model bandwidth in hertz')
    parser.add_argument('--nb-channels',
                        type=int,
                        default=2,
                        help='set number of channels for model (1, 2)')
    parser.add_argument('--nb-workers',
                        type=int,
                        default=0,
                        help='Number of workers for dataloader.')

    # Misc Parameters
    parser.add_argument('--quiet',
                        action='store_true',
                        default=False,
                        help='less verbose during training')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')

    args, _ = parser.parse_known_args()

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    print("Using GPU:", use_cuda)
    dataloader_kwargs = {
        'num_workers': args.nb_workers,
        'pin_memory': True
    } if use_cuda else {}

    repo_dir = os.path.abspath(os.path.dirname(__file__))
    repo = Repo(repo_dir)
    commit = repo.head.commit.hexsha[:7]

    # use jpg or npy
    torch.manual_seed(args.seed)
    random.seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")
    train_dataset, valid_dataset, args = data.load_datasets(parser, args)

    num_train = len(train_dataset)
    indices = list(range(num_train))

    # shuffle train indices once and for all
    np.random.seed(args.seed)
    np.random.shuffle(indices)

    if args.reduce_samples > 1:
        split = int(np.floor(num_train / args.reduce_samples))
        train_idx = indices[:split]
    else:
        train_idx = indices
    sampler = SubsetRandomSampler(train_idx)
    # create output dir if not exist
    target_path = Path(args.output)
    target_path.mkdir(parents=True, exist_ok=True)

    train_sampler = torch.utils.data.DataLoader(train_dataset,
                                                batch_size=args.batch_size,
                                                sampler=sampler,
                                                **dataloader_kwargs)

    stats_sampler = torch.utils.data.DataLoader(train_dataset,
                                                batch_size=1,
                                                sampler=sampler,
                                                **dataloader_kwargs)

    valid_sampler = torch.utils.data.DataLoader(valid_dataset,
                                                batch_size=1,
                                                **dataloader_kwargs)

    if args.model:
        scaler_mean = None
        scaler_std = None
    else:
        scaler_mean, scaler_std = get_statistics(args, stats_sampler)

    max_bin = utils.bandwidth_to_max_bin(train_dataset.sample_rate, args.nfft,
                                         args.bandwidth)
    # SNRPSA: de-compress the scaler to avoid an exploding gradient from the uncompressed initial statistics
    if args.loss == 'SNRPSA':
        power = 2
    else:
        power = 1

    unmix = model.OpenUnmixSingle(
        n_fft=4096,
        n_hop=1024,
        input_is_spectrogram=False,
        hidden_size=args.hidden_size,
        nb_channels=args.nb_channels,
        sample_rate=train_dataset.sample_rate,
        nb_layers=3,
        input_mean=scaler_mean,
        input_scale=scaler_std,
        max_bin=max_bin,
        unidirectional=args.unidirectional,
        power=power,
    ).to(device)
    print('learning rate:')
    print(args.lr)
    optimizer = torch.optim.Adam(unmix.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        factor=args.lr_decay_gamma,
        patience=args.lr_decay_patience,
        cooldown=10)

    es = utils.EarlyStopping(patience=args.patience)

    # if a model is specified: resume training
    if args.model:
        print('LOADING MODEL')
        model_path = Path(args.model).expanduser()
        with open(Path(model_path,
                       str(len(args.targets)) + '.json'), 'r') as stream:
            results = json.load(stream)

        target_model_path = Path(model_path, "model.chkpnt")
        checkpoint = torch.load(target_model_path, map_location=device)
        unmix.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        # train for another epochs_trained
        t = tqdm.trange(results['epochs_trained'],
                        results['epochs_trained'] + args.epochs + 1,
                        disable=args.quiet)
        train_losses = results['train_loss_history']
        valid_losses = results['valid_loss_history']
        train_times = results['train_time_history']
        best_epoch = results['best_epoch']
        es.best = results['best_loss']
        es.num_bad_epochs = results['num_bad_epochs']
        print('Model loaded')
    # else start from 0
    else:
        t = tqdm.trange(1, args.epochs + 1, disable=args.quiet)
        train_losses = []
        valid_losses = []
        train_times = []
        best_epoch = 0

    for epoch in t:
        t.set_description("Training Epoch")
        end = time.time()
        train_loss = train(args, unmix, device, train_sampler, optimizer)
        valid_loss = valid(args, unmix, device, valid_sampler)
        scheduler.step(valid_loss)
        train_losses.append(train_loss)
        valid_losses.append(valid_loss)

        t.set_postfix(train_loss=train_loss, val_loss=valid_loss)

        stop = es.step(valid_loss)

        if valid_loss == es.best:
            best_epoch = epoch

        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': unmix.state_dict(),
                'best_loss': es.best,
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict()
            },
            is_best=valid_loss == es.best,
            path=target_path,
        )

        # save params
        params = {
            'epochs_trained': epoch,
            'args': vars(args),
            'best_loss': es.best,
            'best_epoch': best_epoch,
            'train_loss_history': train_losses,
            'valid_loss_history': valid_losses,
            'train_time_history': train_times,
            'num_bad_epochs': es.num_bad_epochs,
            'commit': commit
        }

        with open(Path(target_path,
                       str(len(args.targets)) + '.json'), 'w') as outfile:
            outfile.write(json.dumps(params, indent=4, sort_keys=True))

        train_times.append(time.time() - end)

        if stop:
            print("Apply Early Stopping")
            break
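
Both this trainer and the multi-task variant in Example #20 save state through `utils.save_checkpoint` with a dict of state, an `is_best` flag and an output path (plus a `target` name in #20). A sketch of a helper compatible with both calls, writing a rolling `<name>.chkpnt` file (the `.chkpnt` suffix appears in these examples; the `.pth` copy for the best epoch is an assumption):

import shutil
from pathlib import Path
import torch

def save_checkpoint(state, is_best, path, target=None):
    """Write the latest checkpoint and keep a copy of the best one."""
    path = Path(path)
    stem = target if target is not None else 'model'
    checkpoint = path / (stem + '.chkpnt')
    torch.save(state, checkpoint)           # always overwrite the rolling checkpoint
    if is_best:
        # keep the weights of the best epoch under a separate name
        shutil.copyfile(checkpoint, path / (stem + '.pth'))
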
Example #19
def train_and_evaluate(model,
                       train_dataloader,
                       val_dataloader,
                       optimizer,
                       loss_func,
                       metrics,
                       epochs,
                       model_dir,
                       lr_scheduler,
                       restore_file=None):
    """Train the model and evaluate every epoch.
    Args:
        model: (torch.nn.Module) the neural network
        train_dataloader: (DataLoader) a torch.utils.data.DataLoader object that fetches training data
        val_dataloader: (DataLoader) a torch.utils.data.DataLoader object that fetches validation data
        optimizer: (torch.optim) optimizer for parameters of model
        loss_func: a function that takes batch_output and batch_labels and computes the loss for the batch
        metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each batch
        epochs: (int) number of training epochs
        model_dir: (string) directory containing config, weights and log
        lr_scheduler: learning rate scheduler stepped during training
        restore_file: (string) optional - name of file to restore from (without its extension .pth.tar)
    """
    # reload weights from restore_file if specified
    train_loss_list, val_loss_list = [], []
    early_stopping = utils.EarlyStopping(patience=20, verbose=True)

    if restore_file is not None:
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    best_val_f1 = 0.0  # can be replaced with acc
    for epoch in range(epochs):
        logging.info("lr = {}".format(lr_scheduler.get_last_lr()))
        logging.info("Epoch {}/{}".format(epoch + 1, epochs))

        train_loss = train(model, optimizer, loss_func, train_dataloader,
                           metrics, lr_scheduler)

        val_metrics = evaluate(model, loss_func, val_dataloader, metrics)
        # rmse_record.append(val_metircs['rmse'])
        val_loss = val_metrics['loss']
        # loss_result_list.append((train_loss,val_loss))
        train_loss_list.append(train_loss)
        val_loss_list.append(val_loss)

        val_f1 = val_metrics['acc']
        is_best = val_f1 >= best_val_f1

        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            is_best=is_best,
            checkpoint=model_dir)

        if is_best:
            logging.info("- Found new best accuracy")
            best_val_f1 = val_f1

            best_json_path = os.path.join(model_dir,
                                          "val_acc_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        last_json_path = os.path.join(model_dir, "val_acc_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)

        early_stopping(val_loss, model)
        if early_stopping.early_stop:
            logging.info("Early stopping!")
            break
    # return rmse_record
    return {"train_loss": train_loss_list, "val_loss": val_loss_list}
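
Here `utils.EarlyStopping` is used as a callable: `early_stopping(val_loss, model)` is invoked once per epoch and the loop then checks `early_stopping.early_stop`. A minimal sketch of that interface, assuming it also saves the best weights (the default checkpoint path and the message format are made up for illustration):

import numpy as np
import torch

class EarlyStopping:
    """Callable variant: call it once per epoch with the current validation loss."""

    def __init__(self, patience=7, verbose=False, path='checkpoint.pt'):
        self.patience = patience
        self.verbose = verbose
        self.path = path
        self.best_loss = np.inf
        self.counter = 0
        self.early_stop = False

    def __call__(self, val_loss, model):
        if val_loss < self.best_loss:
            if self.verbose:
                print(f'Validation loss improved ({self.best_loss:.6f} -> {val_loss:.6f}); saving model')
            self.best_loss = val_loss
            self.counter = 0
            torch.save(model.state_dict(), self.path)  # keep the best weights
        else:
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
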
Example #20
def main():
    parser = argparse.ArgumentParser(description='Open Unmix Trainer')

    # which target do we want to train?
    # =============================================================================
    #     parser.add_argument('--target', type=str, default='vocals',
    #                         help='target source (will be passed to the dataset)')
    #
    # =============================================================================
    parser.add_argument('--target',
                        type=str,
                        default='tabla',
                        help='target source (will be passed to the dataset)')

    # Dataset parameters
    parser.add_argument('--dataset',
                        type=str,
                        default="aligned",
                        choices=[
                            'musdb', 'aligned', 'sourcefolder',
                            'trackfolder_var', 'trackfolder_fix'
                        ],
                        help='Name of the dataset.')
    parser.add_argument('--root',
                        type=str,
                        help='root path of dataset',
                        default='../rec_data_final/')
    parser.add_argument('--output',
                        type=str,
                        default="../new_models/model_tabla_mtl_ourmix_1",
                        help='provide output path base folder name')
    #parser.add_argument('--model', type=str, help='Path to checkpoint folder' , default='../out_unmix/model_new_data_aug_tabla_mse_pretrain1')
    #parser.add_argument('--model', type=str, help='Path to checkpoint folder' , default="../out_unmix/model_new_data_aug_tabla_mse_pretrain8" )
    #parser.add_argument('--model', type=str, help='Path to checkpoint folder' , default='../out_unmix/model_new_data_aug_tabla_bce_finetune2')
    parser.add_argument('--model', type=str, help='Path to checkpoint folder')
    #parser.add_argument('--model', type=str, help='Path to checkpoint folder' , default='umxhq')
    parser.add_argument(
        '--onset-model',
        type=str,
        help='Path to onset detection model weights',
        default=
        "/media/Sharedata/rohit/cnn-onset-det/models/apr4/saved_model_0_80mel-0-16000_1ch_44100.pt"
    )

    # Training Parameters
    parser.add_argument('--epochs', type=int, default=1000)
    parser.add_argument('--batch-size', type=int, default=16)
    parser.add_argument('--lr',
                        type=float,
                        default=0.001,
                        help='learning rate, defaults to 1e-3')
    parser.add_argument(
        '--patience',
        type=int,
        default=140,
        help='maximum number of epochs to train (default: 140)')
    parser.add_argument('--lr-decay-patience',
                        type=int,
                        default=80,
                        help='lr decay patience for plateau scheduler')
    parser.add_argument('--lr-decay-gamma',
                        type=float,
                        default=0.3,
                        help='gamma of learning rate scheduler decay')
    parser.add_argument('--weight-decay',
                        type=float,
                        default=0.00001,
                        help='weight decay')
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        metavar='S',
                        help='random seed (default: 42)')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.0,
                        help='weighting of different loss components')
    parser.add_argument(
        '--finetune',
        type=int,
        default=0,
        help=
        'If 1 (true), optimiser states from the checkpoint model are reset (required for BCE finetuning); if 0 (false), training resumes from where it left off'
    )
    parser.add_argument('--onset-thresh',
                        type=float,
                        default=0.3,
                        help='Threshold above which onset is said to occur')
    parser.add_argument(
        '--binarise',
        type=int,
        default=0,
        help=
        'If 1 (true), the target novelty function is made binary; if 0 (false), it is left as is'
    )
    parser.add_argument(
        '--onset-trainable',
        type=int,
        default=0,
        help=
        'If 1 (true), the onsetCNN is also trained in the finetuning stage; if 0 (false), it is kept fixed'
    )

    # Model Parameters
    parser.add_argument('--seq-dur',
                        type=float,
                        default=6.0,
                        help='Sequence duration in seconds; '
                        'a value of <=0.0 will use full/variable length')
    parser.add_argument(
        '--unidirectional',
        action='store_true',
        default=False,
        help='Use unidirectional LSTM instead of bidirectional')
    parser.add_argument('--nfft',
                        type=int,
                        default=4096,
                        help='STFT fft size and window size')
    parser.add_argument('--nhop', type=int, default=1024, help='STFT hop size')

    # =============================================================================
    #     parser.add_argument('--nfft', type=int, default=2048,
    #                         help='STFT fft size and window size')
    #     parser.add_argument('--nhop', type=int, default=512,
    #                         help='STFT hop size')
    # =============================================================================

    parser.add_argument('--n-mels',
                        type=int,
                        default=80,
                        help='Number of bins in mel spectrogram')

    parser.add_argument(
        '--hidden-size',
        type=int,
        default=512,
        help='hidden size parameter of dense bottleneck layers')
    parser.add_argument('--bandwidth',
                        type=int,
                        default=16000,
                        help='maximum model bandwidth in hertz')
    parser.add_argument('--nb-channels',
                        type=int,
                        default=2,
                        help='set number of channels for model (1, 2)')
    parser.add_argument('--nb-workers',
                        type=int,
                        default=4,
                        help='Number of workers for dataloader.')

    # Misc Parameters
    parser.add_argument('--quiet',
                        action='store_true',
                        default=False,
                        help='less verbose during training')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')

    args, _ = parser.parse_known_args()

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    print("Using GPU:", use_cuda)
    print("Using Torchaudio: ", utils._torchaudio_available())
    dataloader_kwargs = {
        'num_workers': args.nb_workers,
        'pin_memory': True
    } if use_cuda else {}

    repo_dir = os.path.abspath(os.path.dirname(__file__))
    repo = Repo(repo_dir)
    commit = repo.head.commit.hexsha[:7]

    # use jpg or npy
    torch.manual_seed(args.seed)
    random.seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")
    torch.autograd.set_detect_anomaly(True)

    train_dataset, valid_dataset, args = data.load_datasets(parser, args)
    print("TRAIN DATASET", train_dataset)
    print("VALID DATASET", valid_dataset)

    # create output dir if not exist
    target_path = Path(args.output)
    target_path.mkdir(parents=True, exist_ok=True)

    train_sampler = torch.utils.data.DataLoader(train_dataset,
                                                batch_size=args.batch_size,
                                                shuffle=True,
                                                **dataloader_kwargs)
    valid_sampler = torch.utils.data.DataLoader(valid_dataset,
                                                batch_size=1,
                                                **dataloader_kwargs)

    if args.model:
        scaler_mean = None
        scaler_std = None
    else:
        scaler_mean, scaler_std = get_statistics(args, train_dataset)

    max_bin = utils.bandwidth_to_max_bin(train_dataset.sample_rate, args.nfft,
                                         args.bandwidth)

    unmix = model_mtl.OpenUnmix_mtl(
        input_mean=scaler_mean,
        input_scale=scaler_std,
        nb_channels=args.nb_channels,
        hidden_size=args.hidden_size,
        n_fft=args.nfft,
        n_hop=args.nhop,
        max_bin=max_bin,
        sample_rate=train_dataset.sample_rate).to(device)

    #Read trained onset detection network (Model through which target spectrogram is passed)
    detect_onset = model.onsetCNN().to(device)
    detect_onset.load_state_dict(
        torch.load(args.onset_model, map_location='cuda:0'))

    #Model through which separated output is passed
    # detect_onset_training = model.onsetCNN().to(device)
    # detect_onset_training.load_state_dict(torch.load(args.onset_model, map_location='cuda:0'))

    for child in detect_onset.children():
        for param in child.parameters():
            param.requires_grad = False

    #If onset trainable is false, then we want to keep the weights of this model fixed
    # if (args.onset_trainable == 0):
    #     for child in detect_onset_training.children():
    #         for param in child.parameters():
    #             param.requires_grad = False

    # #FOR CHECKING, REMOVE LATER
    # for child in detect_onset_training.children():
    #     for param in child.parameters():
    #         print(param.requires_grad)

    optimizer = torch.optim.Adam(unmix.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        factor=args.lr_decay_gamma,
        patience=args.lr_decay_patience,
        cooldown=10)

    es = utils.EarlyStopping(patience=args.patience)

    # if a model is specified: resume training
    if args.model:
        model_path = Path(args.model).expanduser()
        with open(Path(model_path, args.target + '.json'), 'r') as stream:
            results = json.load(stream)

        target_model_path = Path(model_path, args.target + ".chkpnt")
        checkpoint = torch.load(target_model_path, map_location=device)
        unmix.load_state_dict(checkpoint['state_dict'])

        #Only when the onset model is trainable and finetuning is being resumed from where it left off, read the onset state_dict
        # if ((args.onset_trainable==1)and(args.finetune==0)):
        #     detect_onset_training.load_state_dict(checkpoint['onset_state_dict'])
        #     print("Reading saved onset model")
        # else:
        #     print("Not reading saved onset model")

        if (args.finetune == 0):
            optimizer.load_state_dict(checkpoint['optimizer'])
            scheduler.load_state_dict(checkpoint['scheduler'])
            # train for another epochs_trained
            t = tqdm.trange(results['epochs_trained'],
                            results['epochs_trained'] + args.epochs + 1,
                            disable=args.quiet)
            print("PICKUP WHERE LEFT OFF", args.finetune)
            train_losses = results['train_loss_history']
            train_mse_losses = results['train_mse_loss_history']
            train_bce_losses = results['train_bce_loss_history']
            valid_losses = results['valid_loss_history']
            valid_mse_losses = results['valid_mse_loss_history']
            valid_bce_losses = results['valid_bce_loss_history']
            train_times = results['train_time_history']
            best_epoch = results['best_epoch']

            es.best = results['best_loss']
            es.num_bad_epochs = results['num_bad_epochs']

        else:
            t = tqdm.trange(1, args.epochs + 1, disable=args.quiet)
            train_losses = []
            train_mse_losses = []
            train_bce_losses = []
            print("NOT PICKUP WHERE LEFT OFF", args.finetune)
            valid_losses = []
            valid_mse_losses = []
            valid_bce_losses = []

            train_times = []
            best_epoch = 0

        #es.best = results['best_loss']
        #es.num_bad_epochs = results['num_bad_epochs']
    # else start from 0
    else:
        t = tqdm.trange(1, args.epochs + 1, disable=args.quiet)
        train_losses = []
        train_mse_losses = []
        train_bce_losses = []

        valid_losses = []
        valid_mse_losses = []
        valid_bce_losses = []

        train_times = []
        best_epoch = 0

    for epoch in t:
        t.set_description("Training Epoch")
        end = time.time()
        train_loss, train_mse_loss, train_bce_loss = train(
            args,
            unmix,
            device,
            train_sampler,
            optimizer,
            detect_onset=detect_onset)
        #train_mse_loss = train(args, unmix, device, train_sampler, optimizer, detect_onset=detect_onset)[1]
        #train_bce_loss = train(args, unmix, device, train_sampler, optimizer, detect_onset=detect_onset)[2]

        valid_loss, valid_mse_loss, valid_bce_loss = valid(
            args, unmix, device, valid_sampler, detect_onset=detect_onset)
        #valid_mse_loss = valid(args, unmix, device, valid_sampler, detect_onset=detect_onset)[1]
        #valid_bce_loss = valid(args, unmix, device, valid_sampler, detect_onset=detect_onset)[2]

        scheduler.step(valid_loss)
        train_losses.append(train_loss)
        train_mse_losses.append(train_mse_loss)
        train_bce_losses.append(train_bce_loss)

        valid_losses.append(valid_loss)
        valid_mse_losses.append(valid_mse_loss)
        valid_bce_losses.append(valid_bce_loss)

        t.set_postfix(train_loss=train_loss, val_loss=valid_loss)

        stop = es.step(valid_loss)

        #from matplotlib import pyplot as plt

        # =============================================================================
        #         plt.figure(figsize=(16,12))
        #         plt.subplot(2, 2, 1)
        #         plt.title("Training loss")
        #         plt.plot(train_losses,label="Training")
        #         plt.xlabel("Iterations")
        #         plt.ylabel("Loss")
        #         plt.legend()
        #         plt.show()
        #         #plt.savefig(Path(target_path, "train_plot.pdf"))
        #
        #         plt.figure(figsize=(16,12))
        #         plt.subplot(2, 2, 2)
        #         plt.title("Validation loss")
        #         plt.plot(valid_losses,label="Validation")
        #         plt.xlabel("Iterations")
        #         plt.ylabel("Loss")
        #         plt.legend()
        #         plt.show()
        #         #plt.savefig(Path(target_path, "val_plot.pdf"))
        # =============================================================================

        if valid_loss == es.best:
            best_epoch = epoch

        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': unmix.state_dict(),
                'best_loss': es.best,
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict(),
                'onset_state_dict': detect_onset.state_dict()
            },
            is_best=valid_loss == es.best,
            path=target_path,
            target=args.target)

        # save params
        params = {
            'epochs_trained': epoch,
            'args': vars(args),
            'best_loss': es.best,
            'best_epoch': best_epoch,
            'train_loss_history': train_losses,
            'train_mse_loss_history': train_mse_losses,
            'train_bce_loss_history': train_bce_losses,
            'valid_loss_history': valid_losses,
            'valid_mse_loss_history': valid_mse_losses,
            'valid_bce_loss_history': valid_bce_losses,
            'train_time_history': train_times,
            'num_bad_epochs': es.num_bad_epochs,
            'commit': commit
        }

        with open(Path(target_path, args.target + '.json'), 'w') as outfile:
            outfile.write(json.dumps(params, indent=4, sort_keys=True))

        train_times.append(time.time() - end)

        if stop:
            print("Apply Early Stopping")
            break


# =============================================================================
#     plt.figure(figsize=(16,12))
#     plt.subplot(2, 2, 1)
#     plt.title("Training loss")
#     #plt.plot(train_losses,label="Training")
#     plt.plot(train_losses,label="Training")
#     plt.xlabel("Iterations")
#     plt.ylabel("Loss")
#     plt.legend()
#     #plt.show()
#
#     plt.figure(figsize=(16,12))
#     plt.subplot(2, 2, 2)
#     plt.title("Validation loss")
#     plt.plot(valid_losses,label="Validation")
#     plt.xlabel("Iterations")
#     plt.ylabel("Loss")
#     plt.legend()
#     plt.show()
#     plt.savefig(Path(target_path, "train_val_plot.pdf"))
#     #plt.savefig(Path(target_path, "train_plot.pdf"))
# =============================================================================

    print("TRAINING DONE!!")

    plt.figure()
    plt.title("Training loss")
    plt.plot(train_losses, label="Training")
    plt.xlabel("Iterations")
    plt.ylabel("Loss")
    plt.legend()
    plt.savefig(Path(target_path, "train_plot.pdf"))

    plt.figure()
    plt.title("Validation loss")
    plt.plot(valid_losses, label="Validation")
    plt.xlabel("Iterations")
    plt.ylabel("Loss")
    plt.legend()
    plt.savefig(Path(target_path, "val_plot.pdf"))

    plt.figure()
    plt.title("Training BCE loss")
    plt.plot(train_bce_losses, label="Training")
    plt.xlabel("Iterations")
    plt.ylabel("Loss")
    plt.legend()
    plt.savefig(Path(target_path, "train_bce_plot.pdf"))

    plt.figure()
    plt.title("Validation BCE loss")
    plt.plot(valid_bce_losses, label="Validation")
    plt.xlabel("Iterations")
    plt.ylabel("Loss")
    plt.legend()
    plt.savefig(Path(target_path, "val_bce_plot.pdf"))

    plt.figure()
    plt.title("Training MSE loss")
    plt.plot(train_mse_losses, label="Training")
    plt.xlabel("Iterations")
    plt.ylabel("Loss")
    plt.legend()
    plt.savefig(Path(target_path, "train_mse_plot.pdf"))

    plt.figure()
    plt.title("Validation MSE loss")
    plt.plot(valid_mse_losses, label="Validation")
    plt.xlabel("Iterations")
    plt.ylabel("Loss")
    plt.legend()
    plt.savefig(Path(target_path, "val_mse_plot.pdf"))
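
The Open-Unmix trainers in this listing crop the model input to a maximum frequency bin via `utils.bandwidth_to_max_bin(sample_rate, n_fft, bandwidth)`. A small sketch of that mapping, assuming linearly spaced STFT bin centre frequencies; treat it as an illustration rather than the packaged helper:

import numpy as np

def bandwidth_to_max_bin(rate, n_fft, bandwidth):
    """Return the number of STFT bins whose centre frequency is <= bandwidth (in Hz)."""
    # Bin centre frequencies of a real-valued STFT: 0 .. rate/2 in n_fft//2 + 1 steps
    freqs = np.linspace(0, float(rate) / 2, n_fft // 2 + 1, endpoint=True)
    return np.max(np.where(freqs <= bandwidth)[0]) + 1

# e.g. 44.1 kHz audio, 4096-point FFT, 16 kHz bandwidth -> roughly the first 1487 bins are kept
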
Example #21
def train_deep():
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    import torch.optim as optim
    from torch.utils.data import Dataset
    from model import FCN
    from torch.optim import lr_scheduler

    def train(model, device, train_loader, optimizer):
        model.train()
        train_loss = 0
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            train_loss += loss.item()
            loss.backward()
            optimizer.step()

        return train_loss / len(train_loader.dataset)

    def test(model, device, test_loader):
        model.eval()
        test_loss = 0
        correct = 0
        with torch.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                test_loss += F.nll_loss(
                    output, target,
                    reduction="sum").item()  # sum up batch loss
                pred = output.argmax(
                    dim=1,
                    keepdim=True)  # get the index of the max log-probability
                correct += pred.eq(target.view_as(pred)).sum().item()

        test_loss /= len(test_loader.dataset)
        test_acc = 100.0 * correct / len(test_loader.dataset)

        return test_loss, test_acc

    # training settings
    batch_size = 32
    test_batch_size = 1000
    epochs = 500
    patience = 30  # for early stopping
    use_cuda = torch.cuda.is_available()

    torch.manual_seed(9)

    device = torch.device("cuda" if use_cuda else "cpu")

    kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {}
    train_loader = torch.utils.data.DataLoader(
        PoseDataset([root_dir / d for d in train_data_dirs]),
        batch_size=batch_size,
        shuffle=True,
        **kwargs,
    )
    test_loader = torch.utils.data.DataLoader(
        PoseDataset([root_dir / d for d in test_data_dirs], mode="test"),
        batch_size=test_batch_size,
        shuffle=True,
        **kwargs,
    )

    model = FCN().to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-3, amsgrad=True)

    early_stopping = utils.EarlyStopping(patience, Path("results"))
    for epoch in range(1, epochs + 1):
        train_loss = train(model, device, train_loader, optimizer)
        test_loss, test_acc = test(model, device, test_loader)
        print(f"epoch: {epoch:>3}, train_loss: {train_loss:.4f}, ", end="")
        print(f"test_loss: {test_loss:.4f}, test_acc: {test_acc:.3f}")

        early_stopping(test_loss, test_acc, model)

        if early_stopping.early_stop:
            print("Early stopping activated")
            break

    print(f"deep model acc: {early_stopping.best_acc}")
Example #22
    acc = 100.0 * correct / total
    if acc > best_acc:
        best_acc = acc
    return acc, best_acc


if __name__ == "__main__":
    try:
        # trial get next parameter from network morphism tuner
        RCV_CONFIG = nni.get_next_parameter()
        logger.debug(RCV_CONFIG)

        parse_rev_args(RCV_CONFIG)
        train_acc = 0.0
        best_acc = 0.0
        early_stop = utils.EarlyStopping(mode="max")
        for ep in range(args.epochs):
            train_acc = train(ep)
            test_acc, best_acc = test(ep)
            nni.report_intermediate_result(test_acc)
            logger.debug(test_acc)
            if early_stop.step(test_acc):
                break

        # trial report best_acc to tuner
        nni.report_final_result(best_acc)
    except Exception as exception:
        logger.exception(exception)
        raise
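
This trial constructs the helper with mode="max" and feeds it the test accuracy, so a larger value counts as an improvement. A sketch of how the earlier step()-style helper can be made mode-aware (the attribute names are assumptions):

class EarlyStopping:
    """step()-style early stopping where `mode` selects the improvement direction."""

    def __init__(self, patience=10, min_delta=0.0, mode='min'):
        assert mode in ('min', 'max')
        self.patience = patience
        self.min_delta = min_delta
        self.mode = mode
        self.best = None
        self.num_bad_epochs = 0

    def _improved(self, value):
        if self.best is None:
            return True
        if self.mode == 'min':
            return value < self.best - self.min_delta
        return value > self.best + self.min_delta       # mode == 'max'

    def step(self, value):
        if self._improved(value):
            self.best = value
            self.num_bad_epochs = 0
        else:
            self.num_bad_epochs += 1
        return self.num_bad_epochs > self.patience
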
Example #23
#plt.imshow(grid_img.permute(1, 2, 0))
#plt.figure()
#plt.title('Ground Truths')
#gt_grid = vutils.make_grid(Y, nrow=4)
#plt.imshow(gt_grid.permute(1,2,0))


model = models.unet(n_channels=3, n_classes=1)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

#summary(model, input_size=(3, 144, 144))

opt = optim.SGD(model.parameters(), lr=0.01, momentum=0.95)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(opt, mode='min', patience=5, verbose=True)
early_stopping = utils.EarlyStopping(patience=8, verbose=True)

print('='*30)
print('Training')
print('='*30)
epoch_train_loss = []
epoch_val_loss = []
epoch_train_dsc = []
epoch_val_dsc = []


for epoch in range(num_epochs):
    train_losses = []
    train_dsc = []
    val_losses = []
    val_dsc = []
Example #24
        p.start()

        single_acc, current_ep = train_eval(init_search_space_point,
                                            RCV_CONFIG,
                                            int(nni.get_sequence_id()))
        print("HPO-" + str(train_num) + ",hyperparameters:" +
              str(init_search_space_point) + ",best_val_acc:" +
              str(single_acc))

        best_final = single_acc
        searched_space_point = init_search_space_point

        if int(nni.get_sequence_id()) > 3 * args.slave - 1:
            dict_first_data = init_search_space_point
            TPE.receive_trial_result(train_num, dict_first_data, single_acc)
            TPEearlystop = utils.EarlyStopping(patience=3, mode="max")

            for train_num in range(1, args.maxTPEsearchNum):
                params = TPE.generate_parameters(train_num)
                start_date = time.strftime('%m/%d/%Y, %H:%M:%S',
                                           time.localtime(time.time()))

                current_hyperparameter = params
                hp_path = experiment_path + '/hyperparameter_epoch/' + str(
                    nni.get_trial_id()) + '/' + str(train_num) + '.json'
                with open(hp_path, 'w') as f:
                    json.dump(
                        {
                            'get_sequence_id': int(nni.get_sequence_id()),
                            'hyperparameter': current_hyperparameter,
                            'epoch': 0,
Example #25
def train(fold, epochs, training_file, tokenizer, max_len, train_batch_size, valid_batch_size, roberta_path, lr, patience, num_warmup_steps):
    dfx = pd.read_csv(training_file)

    df_train = dfx[dfx.kfold != fold].reset_index(drop = True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop = True)

    # training set
    train_dataset = TweetDataset(
        tweet = df_train.text.values,
        sentiment = df_train.sentiment.values,
        selected_text = df_train.selected_text.values,
        tokenizer = tokenizer,
        max_len = max_len
    )
    # validation set
    valid_dataset = TweetDataset(
        tweet = df_valid.text.values,
        sentiment = df_valid.sentiment.values,
        selected_text = df_valid.selected_text.values,
        tokenizer = tokenizer,
        max_len = max_len
    )

    train_sampler, valid_sampler = None, None
    if args.shuffle:
        train_sampler = RandomSampler(train_dataset)
        valid_sampler = SequentialSampler(valid_dataset)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size = train_batch_size,
        num_workers = 4,
        sampler=train_sampler
    )


    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size = valid_batch_size,
        num_workers = 2,
        sampler=valid_sampler
    )

    device = torch.device("cuda")

    model_config = transformers.RobertaConfig.from_pretrained(roberta_path)
    model_config.output_hidden_states = True
    model = TweetModel(roberta_path = roberta_path, conf = model_config)
    model.to(device)

    num_train_steps = int(len(df_train) / train_batch_size * epochs)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.003},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]

    optimizer = AdamW(optimizer_parameters, lr = lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps = num_warmup_steps,
        num_training_steps = num_train_steps
    )

    if args.fp16:
        # try:
        #     from apex import amp
        # except ImportError:
        #     raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)


    # multi-gpu training (should be after apex fp16 initialization)
    if args.parallel:
        model = torch.nn.DataParallel(model)

    es = utils.EarlyStopping(patience = patience, mode = "max")
    print("Training is Starting for fold", fold)

    for epoch in range(epochs):
        train_fn(train_data_loader, model, optimizer, device, scheduler = scheduler)
        jaccard = eval_fn(valid_data_loader, model, device)
        print("Jaccard Score = ", jaccard)
        experiment.log_metric("jaccard", jaccard)
        es(jaccard, model, model_path = f"{save_path}/model_{fold}.bin")
        if es.early_stop:
            print("Early stopping")
            break
    del model, optimizer, scheduler, df_train, df_valid, train_dataset, valid_dataset, train_data_loader, valid_data_loader
    import gc
    gc.collect()
    torch.cuda.empty_cache()
Example #26
def train():
    """
    Train the model on the train/dev split defined in config
    """
    # Read train csv and dev csv
    df_train = pd.read_csv(config.TRAIN_FILE)
    df_valid = pd.read_csv(config.DEV_FILE)

    # Instantiate TweetDataset with training data
    train_dataset = SiameseDataset(query=df_train.sentence1.values,
                                   question=df_train.sentence2.values,
                                   label=df_train.label.values)

    if os.path.exists(config.train_features):
        train_dataset = load_pkl_data(config.train_features)
    else:
        train_dataset = [item for item in train_dataset]
        save_pkl_data(train_dataset, config.train_features)

    # Instantiate DataLoader with `train_dataset`
    # This is a generator that yields the dataset in batches
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, shuffle=True, batch_size=config.TRAIN_BATCH_SIZE)

    # Instantiate TweetDataset with validation data
    valid_dataset = SiameseDataset(query=df_valid.sentence1.values,
                                   question=df_valid.sentence2.values,
                                   label=df_valid.label.values)

    if os.path.exists(config.valid_features):
        valid_dataset = load_pkl_data(config.valid_features)
    else:
        valid_dataset = [item for item in valid_dataset]
        save_pkl_data(valid_dataset, config.valid_features)

    # Instantiate DataLoader with `valid_dataset`
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE)

    # Set device as `cuda` (GPU)
    device = torch.device("cuda:2")
    # Load pretrained BERT (bert-base-uncased)
    model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
    # Output hidden states
    # This is important to set since we want to concatenate the hidden states from the last 2 BERT layers
    model_config.output_hidden_states = True
    # Instantiate our model with `model_config`
    model = SiameseWmdModel(conf=model_config,
                            pretrained_model_path=config.BERT_PATH)
    # Move the model to the GPU
    model.to(device)

    # Calculate the number of training steps
    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    # Get the list of named parameters
    param_optimizer = list(model.named_parameters())
    # Specify parameters where weight decay shouldn't be applied
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    # Define two sets of parameters: those with weight decay, and those without
    optimizer_parameters = [
        {
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.001
        },
        {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        },
    ]
    # Instantiate AdamW optimizer with our two sets of parameters, and a learning rate of 3e-5
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    # Create a scheduler to set the learning rate at each training step
    # "Create a schedule with a learning rate that decreases linearly after linearly increasing during a warmup period." (https://pytorch.org/docs/stable/optim.html)
    # Since num_warmup_steps = 0, the learning rate starts at 3e-5, and then linearly decreases at each training step
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    # Apply early stopping with patience of 2
    # This means to stop training new epochs when 2 rounds have passed without any improvement
    es = utils.EarlyStopping(patience=2, mode="max")

    thresholds = [0.1, 0.15, 0.20]
    best_f1 = 0
    best_th = 0
    for threshold in thresholds:

        # I'm training only for 3 epochs even though I specified 5!!!
        for epoch in range(config.EPOCHS):
            train_fn(train_data_loader,
                     model,
                     optimizer,
                     device,
                     scheduler=scheduler,
                     threshold=threshold)
            acc, f1, auc = eval_fn(valid_data_loader, model, device)

            # logger.info(f"acc = {acc}, f1 score = {f1}")
            es(f1, model, model_path=config.MODEL_SAVE_PATH)
            if es.early_stop:
                if f1 > best_f1:
                    best_f1 = f1
                    best_th = threshold
                print("Early stopping ********")
                break
    logger.info(f"best threshold:{best_th}, best f1 :{best_f1}")
Example #27
def main():
    parser = argparse.ArgumentParser(description='Open Unmix Trainer')

    # which target do we want to train?
    parser.add_argument('--target',
                        type=str,
                        default='vocals',
                        help='target source (will be passed to the dataset)')

    # Dataset parameters
    parser.add_argument('--dataset',
                        type=str,
                        default="aligned",
                        choices=[
                            'musdb', 'aligned', 'sourcefolder',
                            'trackfolder_var', 'trackfolder_fix'
                        ],
                        help='Name of the dataset.')
    parser.add_argument('--root',
                        type=str,
                        help='root path of dataset',
                        default='../rec_data_new/')
    parser.add_argument('--output',
                        type=str,
                        default="../out_unmix/model_new_data_aug_tl",
                        help='provide output path base folder name')
    #parser.add_argument('--model', type=str, help='Path to checkpoint folder' , default='../out_unmix/model_new_data')
    #parser.add_argument('--model', type=str, help='Path to checkpoint folder')
    parser.add_argument('--model',
                        type=str,
                        help='Path to checkpoint folder',
                        default='umxhq')

    # Training Parameters
    parser.add_argument('--epochs', type=int, default=1000)
    parser.add_argument('--batch-size', type=int, default=32)
    parser.add_argument('--lr',
                        type=float,
                        default=0.0001,
                        help='learning rate, defaults to 1e-4')
    parser.add_argument(
        '--patience',
        type=int,
        default=140,
        help='maximum number of epochs to train (default: 140)')
    parser.add_argument('--lr-decay-patience',
                        type=int,
                        default=80,
                        help='lr decay patience for plateau scheduler')
    parser.add_argument('--lr-decay-gamma',
                        type=float,
                        default=0.3,
                        help='gamma of learning rate scheduler decay')
    parser.add_argument('--weight-decay',
                        type=float,
                        default=0.0000000001,
                        help='weight decay')
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        metavar='S',
                        help='random seed (default: 42)')

    # Model Parameters
    parser.add_argument('--seq-dur',
                        type=float,
                        default=6.0,
                        help='Sequence duration in seconds; '
                        'a value of <=0.0 will use full/variable length')
    parser.add_argument(
        '--unidirectional',
        action='store_true',
        default=False,
        help='Use unidirectional LSTM instead of bidirectional')
    parser.add_argument('--nfft',
                        type=int,
                        default=4096,
                        help='STFT fft size and window size')
    parser.add_argument('--nhop', type=int, default=1024, help='STFT hop size')
    parser.add_argument(
        '--hidden-size',
        type=int,
        default=512,
        help='hidden size parameter of dense bottleneck layers')
    parser.add_argument('--bandwidth',
                        type=int,
                        default=16000,
                        help='maximum model bandwidth in hertz')
    parser.add_argument('--nb-channels',
                        type=int,
                        default=2,
                        help='set number of channels for model (1, 2)')
    parser.add_argument('--nb-workers',
                        type=int,
                        default=4,
                        help='Number of workers for dataloader.')

    # Misc Parameters
    parser.add_argument('--quiet',
                        action='store_true',
                        default=False,
                        help='less verbose during training')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')

    args, _ = parser.parse_known_args()

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    print("Using GPU:", use_cuda)
    print("Using Torchaudio: ", utils._torchaudio_available())
    dataloader_kwargs = {
        'num_workers': args.nb_workers,
        'pin_memory': True
    } if use_cuda else {}

    repo_dir = os.path.abspath(os.path.dirname(__file__))
    repo = Repo(repo_dir)
    commit = repo.head.commit.hexsha[:7]

    # use jpg or npy
    torch.manual_seed(args.seed)
    random.seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    train_dataset, valid_dataset, args = data.load_datasets(parser, args)
    print("TRAIN DATASET", train_dataset)
    print("VALID DATASET", valid_dataset)

    # create output dir if not exist
    target_path = Path(args.output)
    target_path.mkdir(parents=True, exist_ok=True)

    train_sampler = torch.utils.data.DataLoader(train_dataset,
                                                batch_size=args.batch_size,
                                                shuffle=True,
                                                **dataloader_kwargs)
    valid_sampler = torch.utils.data.DataLoader(valid_dataset,
                                                batch_size=1,
                                                **dataloader_kwargs)

    # =============================================================================
    #     if args.model:
    #         scaler_mean = None
    #         scaler_std = None
    #
    #     else:
    # =============================================================================
    scaler_mean, scaler_std = get_statistics(args, train_dataset)

    max_bin = utils.bandwidth_to_max_bin(train_dataset.sample_rate, args.nfft,
                                         args.bandwidth)
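    # --- illustrative sketch (assumption, not the project's utils module) -----
    # bandwidth_to_max_bin presumably converts the requested bandwidth in Hz into
    # the number of STFT frequency bins the model keeps; a minimal version:
    def _bandwidth_to_max_bin_sketch(rate, n_fft, bandwidth):
        import numpy as np
        # centre frequency of every rFFT bin, from 0 Hz up to the Nyquist rate
        freqs = np.linspace(0, rate / 2, n_fft // 2 + 1)
        # number of bins whose centre frequency is still inside the bandwidth
        return int(np.max(np.where(freqs <= bandwidth)[0]) + 1)
    # ---------------------------------------------------------------------------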

    unmix = model.OpenUnmix(input_mean=scaler_mean,
                            input_scale=scaler_std,
                            nb_channels=args.nb_channels,
                            hidden_size=args.hidden_size,
                            n_fft=args.nfft,
                            n_hop=args.nhop,
                            max_bin=max_bin,
                            sample_rate=train_dataset.sample_rate).to(device)

    optimizer = torch.optim.Adam(unmix.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        factor=args.lr_decay_gamma,
        patience=args.lr_decay_patience,
        cooldown=10)
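    # ReduceLROnPlateau multiplies the learning rate by lr_decay_gamma once the
    # validation loss has not improved for lr_decay_patience epochs, then waits
    # the 10 cooldown epochs before it starts counting bad epochs again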

    es = utils.EarlyStopping(patience=args.patience)
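    # --- illustrative sketch (assumption, not the project's utils module) -----
    # utils.EarlyStopping is assumed to expose the interface used below:
    # step(metric) returns True once `patience` epochs pass without improvement,
    # and `best` / `num_bad_epochs` are read back when checkpointing. A minimal
    # stand-in (unused by this script) could look like this:
    class _EarlyStoppingSketch:
        def __init__(self, mode='min', min_delta=0.0, patience=10):
            self.mode = mode
            self.min_delta = min_delta
            self.patience = patience
            self.best = None
            self.num_bad_epochs = 0

        def step(self, metric):
            # the first call just records the metric as the best so far
            if self.best is None:
                self.best = metric
                return False
            if self.mode == 'min':
                improved = metric < self.best - self.min_delta
            else:
                improved = metric > self.best + self.min_delta
            if improved:
                self.best = metric
                self.num_bad_epochs = 0
            else:
                self.num_bad_epochs += 1
            return self.num_bad_epochs >= self.patience
    # ---------------------------------------------------------------------------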

    # if a model is specified: resume training
    if args.model:
        # disable progress bar
        err = io.StringIO()
        with redirect_stderr(err):
            unmix = torch.hub.load('sigsep/open-unmix-pytorch',
                                   'umxhq',
                                   target=args.target,
                                   device=device,
                                   pretrained=True)
# =============================================================================
#         model_path = Path(args.model).expanduser()
#         with open(Path(model_path, args.target + '.json'), 'r') as stream:
#             results = json.load(stream)
#
#         target_model_path = Path(model_path, args.target + ".chkpnt")
#         checkpoint = torch.load(target_model_path, map_location=device)
#         unmix.load_state_dict(checkpoint['state_dict'])
#         optimizer.load_state_dict(checkpoint['optimizer'])
#         scheduler.load_state_dict(checkpoint['scheduler'])
#         # train for another epochs_trained
#         t = tqdm.trange(
#             results['epochs_trained'],
#             results['epochs_trained'] + args.epochs + 1,
#             disable=args.quiet
#         )
#         train_losses = results['train_loss_history']
#         valid_losses = results['valid_loss_history']
#         train_times = results['train_time_history']
#         best_epoch = results['best_epoch']
#         es.best = results['best_loss']
#         es.num_bad_epochs = results['num_bad_epochs']
#     # else start from 0
# =============================================================================

    t = tqdm.trange(1, args.epochs + 1, disable=args.quiet)
    train_losses = []
    valid_losses = []
    train_times = []
    best_epoch = 0

    for epoch in t:
        t.set_description("Training Epoch")
        end = time.time()
        train_loss = train(args, unmix, device, train_sampler, optimizer)
        valid_loss = valid(args, unmix, device, valid_sampler)
        scheduler.step(valid_loss)
        train_losses.append(train_loss)
        valid_losses.append(valid_loss)

        t.set_postfix(train_loss=train_loss, val_loss=valid_loss)

        stop = es.step(valid_loss)

        # plot the loss curves once per epoch; figures are saved rather than
        # shown, since plt.show() would block the training loop
        from matplotlib import pyplot as plt

        plt.figure(figsize=(16, 12))
        plt.title("Training loss")
        plt.plot(train_losses, label="Training")
        plt.xlabel("Epoch")
        plt.ylabel("Loss")
        plt.legend()
        plt.savefig(Path(target_path, "train_plot.pdf"))
        plt.close()

        plt.figure(figsize=(16, 12))
        plt.title("Validation loss")
        plt.plot(valid_losses, label="Validation")
        plt.xlabel("Epoch")
        plt.ylabel("Loss")
        plt.legend()
        plt.savefig(Path(target_path, "val_plot.pdf"))
        plt.close()

        if valid_loss == es.best:
            best_epoch = epoch

        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': unmix.state_dict(),
                'best_loss': es.best,
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict()
            },
            is_best=valid_loss == es.best,
            path=target_path,
            target=args.target)

        # save params
        params = {
            'epochs_trained': epoch,
            'args': vars(args),
            'best_loss': es.best,
            'best_epoch': best_epoch,
            'train_loss_history': train_losses,
            'valid_loss_history': valid_losses,
            'train_time_history': train_times,
            'num_bad_epochs': es.num_bad_epochs,
            'commit': commit
        }

        with open(Path(target_path, args.target + '.json'), 'w') as outfile:
            outfile.write(json.dumps(params, indent=4, sort_keys=True))

        train_times.append(time.time() - end)

        if stop:
            print("Apply Early Stopping")
            break
    def run():

        torch.manual_seed(seed)

        device = xm.xla_device()
        model = MX.to(device)

        # DataLoaders
        train_dataset = TweetDataset(args=args,
                                     df=train_df,
                                     mode="train",
                                     fold=args.fold_index,
                                     tokenizer=tokenizer)
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset,
            num_replicas=xm.xrt_world_size(),
            rank=xm.get_ordinal(),
            shuffle=True)
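        # the DistributedSampler shards the training set across TPU cores so each
        # ordinal (process) sees a disjoint subset of the data every epoch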
        train_loader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=args.batch_size,
                                                   sampler=train_sampler,
                                                   drop_last=False,
                                                   num_workers=2)

        valid_dataset = TweetDataset(args=args,
                                     df=train_df,
                                     mode="valid",
                                     fold=args.fold_index,
                                     tokenizer=tokenizer)
        valid_sampler = torch.utils.data.distributed.DistributedSampler(
            valid_dataset,
            num_replicas=xm.xrt_world_size(),
            rank=xm.get_ordinal(),
            shuffle=False)
        valid_loader = DataLoader(valid_dataset,
                                  batch_size=args.batch_size,
                                  sampler=valid_sampler,
                                  num_workers=1,
                                  drop_last=False)

        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {
                'params': [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                args.weight_decay
            },
            {
                'params': [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.0
            },
        ]
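        # parameters whose names contain "bias" or "LayerNorm" are grouped
        # separately so that no weight decay is applied to them, the usual
        # convention when fine-tuning transformers with AdamW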

        num_train_steps = int(num_train_dpoints / args.batch_size /
                              xm.xrt_world_size() * args.epochs)
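        # e.g. 20,000 training examples, batch size 32, 8 TPU cores and 3 epochs
        # give int(20000 / 32 / 8 * 3) = 234 per-core optimizer steps for the LR
        # schedule below (figures are illustrative; real values come from the args)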

        optimizer = AdamW(optimizer_parameters,
                          lr=args.learning_rate * xm.xrt_world_size(),
                          eps=args.adam_epsilon)

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args.num_warmup_steps,
            num_training_steps=num_train_steps)

        xm.master_print("Training is Starting ...... ")
        best_jac = 0
        best_loss = 9999
        early_stopping = utils.EarlyStopping(patience=3, mode="max")

        for epoch in range(args.epochs):
            para_loader = pl.ParallelLoader(train_loader, [device])
            train_loss = train(args, para_loader.per_device_loader(device),
                               model, device, optimizer, scheduler, epoch, f,
                               args.max_seq_len)

            para_loader = pl.ParallelLoader(valid_loader, [device])
            valid_jac, valid_loss = valid(
                args, para_loader.per_device_loader(device), model, device,
                tokenizer, epoch, f, args.max_seq_len)

            jac = xm.mesh_reduce("jac_reduce", valid_jac, reduce_fn)
            val_loss = xm.mesh_reduce("valid_loss_reduce", valid_loss,
                                      reduce_fn)
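            # reduce_fn is defined outside this snippet; it presumably averages
            # the per-core values, e.g. lambda vals: sum(vals) / len(vals)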

            xm.master_print(f"**** Epoch {epoch+1} **==>** Jaccard = {jac}")
            xm.master_print(
                f"**** Epoch {epoch+1} **==>** valid_loss = {val_loss}")

            log_ = f"**** Epoch {epoch+1} **==>** Jaccard = {jac}"

            f.write(log_ + "\n\n")

            if jac > best_jac:
                xm.master_print("**** Model Improved !!!! Saving Model")
                xm.save(
                    model.state_dict(),
                    os.path.join(args.save_path, f"fold_{args.fold_index}"))
                best_jac = jac

            early_stopping(jac, model, "none")

            if early_stopping.early_stop:
                print("Early stopping")
                break
Example #29
0
# learned per-task log-variances for uncertainty-based multi-task loss weighting
log_var = {
    'decomp' : torch.zeros((1,), requires_grad=True, device=device),
    'ihm'    : torch.zeros((1,), requires_grad=True, device=device),
    'los'    : torch.zeros((1,), requires_grad=True, device=device),
    'pheno'  : torch.zeros((1,), requires_grad=True, device=device),
    'readmit': torch.zeros((1,), requires_grad=True, device=device),
    'ltm': torch.zeros((1,), requires_grad=True, device=device),

}
# If using uncertainty weighting, use the optimizer below so the log-variances are trained as well
# Leave out the readmit task due to poor performance
#optimizer = torch.optim.Adam(([p for p in model.parameters()] + [log_var[t] for t in log_var if t != 'readmit']), lr=learning_rate) #for uncertainty weighting
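# --- illustrative sketch (assumption, not this project's training code) -------
# With uncertainty weighting (Kendall et al. 2018 style), each task loss gets
# scaled by a learned precision exp(-log_var[task]) and the log-variance itself
# is added as a regulariser. A minimal helper (unused below) could look like:
def _uncertainty_weighted_loss_sketch(task_losses, log_var):
    # task_losses is a hypothetical dict mapping task name -> scalar loss tensor
    total = 0.0
    for task, loss in task_losses.items():
        total = total + torch.exp(-log_var[task]) * loss + log_var[task]
    return total
# -------------------------------------------------------------------------------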

#-------------------- define optimizer and other hyperparams ----------------#
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
scheduler = StepLR(optimizer, step_size=10, gamma=0.3)
early_stopper = utils.EarlyStopping(experiment_name = experiment)

#------------------------load word embedings---------------------#
embedding_layer = nn.Embedding(vectors.shape[0], vectors.shape[1])
embedding_layer.weight.data.copy_(torch.from_numpy(vectors))
embedding_layer.weight.requires_grad = False


#-------------------------- Define the train/val/test dataloaders ------------#
#As long as datapaths are correct, no changes should be necessary.
train_mm_dataset = MultiModal_Dataset(train_val_ts_root_dir, train_val_text_root_dir,train_val_tab_root_dir, train_listfile, discretizer, train_val_starttime_path,\
        regression, bin_type, None, ihm_pos, los_pos,  use_text, use_ts, use_tab, decay, w2i_lookup, max_text_length, max_num_notes)
val_mm_dataset = MultiModal_Dataset(train_val_ts_root_dir, train_val_text_root_dir, train_val_tab_root_dir, val_listfile, discretizer, train_val_starttime_path,\
        regression, bin_type, None, ihm_pos, los_pos,  use_text, use_ts, use_tab, decay, w2i_lookup, max_text_length, max_num_notes)
test_mm_dataset = MultiModal_Dataset(test_ts_root_dir, test_text_root_dir,test_tab_root_dir, test_listfile, discretizer, test_starttime_path,\
        regression, bin_type, None, ihm_pos, los_pos,  use_text, use_ts, use_tab, decay, w2i_lookup, max_text_length, max_num_notes)
Example #30
0
def run(fold=0):
    # kfold type of data input
    data = pd.read_csv(config.TRAIN_FOLDS_FILE)
    df_train = data[data['kfold'] != fold].reset_index(drop=True)
    df_valid = data[data['kfold'] == fold].reset_index(drop=True)

    train_data = CommentData(comments=df_train['Comment'],
                             labels=df_train['Label_encoded'],
                             sentiments=df_train['Sentiment_encoded'])

    train_dataloader = torch.utils.data.DataLoader(
        train_data,
        batch_size=config.TRAIN_BATCH_SIZE,
        # num_workers = 4
    )

    valid_data = CommentData(comments=df_valid['Comment'],
                             labels=df_valid['Label_encoded'],
                             sentiments=df_valid['Sentiment_encoded'])

    valid_dataloader = torch.utils.data.DataLoader(
        valid_data,
        batch_size=config.VALID_BATCH_SIZE,
        # num_workers = 4
    )

    device = torch.device('cuda')

    model_config = RobertaConfig.from_pretrained(config.ROBERTA_PATH)
    model_config.output_hidden_states = True

    model = SentimentModel(model_config, config.OUTPUT_SIZE)
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.001
    }, {
        # bias and LayerNorm parameters should not be decayed
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)

    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    # train_fn(data_loader, model, device, optimizer, scheduler=None)
    train_loss_rec = []
    eval_loss_rec = []

    early_stopping = utils.EarlyStopping(patience=5, mode='min')

    for epoch in range(config.EPOCHS):
        print(f'########### fold = {fold} epoch = {epoch} ############')
        loss_train = engine.train_fn(data_loader=train_dataloader,
                                     model=model,
                                     device=device,
                                     optimizer=optimizer,
                                     scheduler=scheduler)

        train_loss_rec.append(loss_train)

        losses_eval = engine.eval_fn(valid_dataloader, model, device)
        eval_loss_rec.append(losses_eval)

        print(f'train_loss = {loss_train}  eval_loss = {losses_eval}')
        # print(f'save model_{fold}.bin')
        # torch.save(model.state_dict(), config.OUTPUT_PATH + f'/model_{fold}.bin')
        early_stopping(losses_eval,
                       model,
                       model_path=config.OUTPUT_PATH +
                       f'/model_label_{fold}.bin')
        if early_stopping.early_stop:
            print('Early stopping')
            break