Example #1
    def __init__(self,
                 data,
                 model_config,
                 learning_config,
                 pretrained_weight,
                 early_stopping=True,
                 patience=100,
                 json_path=None,
                 vocab_path=None,
                 mapping_path=None,
                 odir=None):
        self.data = data
        self.model_config = model_config
        # max length of a sequence (max nodes among graphs)
        self.seq_max_length = data[MAX_N_NODES]
        self.learning_config = learning_config
        self.pretrained_weight = pretrained_weight
        self.is_cuda = learning_config['cuda']

        # with open(vocab_path+'/../mapping.json', 'r') as f:
        with open(mapping_path, 'r') as f:
            self.mapping = json.load(f)

        self.labels = self.data[LABELS]
        self.graphs_names = self.data[GNAMES]

        data_graph = self.data[GRAPH]
        data_nclasses = self.data[N_CLASSES]
        if N_RELS in self.data:
            data_nrels = self.data[N_RELS]
        else:
            data_nrels = None

        if N_ENTITIES in self.data:
            data_nentities = self.data[N_ENTITIES]
        else:
            data_nentities = None

        self.model = Model(g=data_graph,
                           config_params=model_config,
                           n_classes=data_nclasses,
                           n_rels=data_nrels,
                           n_entities=data_nentities,
                           is_cuda=self.is_cuda,
                           seq_dim=self.seq_max_length,
                           batch_size=1,
                           json_path=json_path,
                           vocab_path=vocab_path)

        if early_stopping:
            self.early_stopping = EarlyStopping(patience=patience,
                                                verbose=True)

        # Output folder to save train / test data
        if odir is None:
            odir = 'output/' + time.strftime("%Y-%m-%d_%H-%M-%S")
        self.odir = odir
Example #2
    def fit(self, trajectories, iteration):
        self._set_logging(iteration)
        # self.optimizer = torch.optim.Adam(
        #     filter(lambda x: x.requires_grad, self.model.parameters()),
        #     lr=self.args.vae_lr,
        # )

        dataset_train, dataset_test = self.preprocess_trajectories(
            trajectories)

        num_trajectories = len(dataset_train)
        batch_size = num_trajectories // self.args.vae_batches

        loader_train = torch.utils.data.DataLoader(dataset_train,
                                                   shuffle=True,
                                                   batch_size=batch_size,
                                                   num_workers=2)
        loader_test = torch.utils.data.DataLoader(dataset_test,
                                                  shuffle=False,
                                                  batch_size=batch_size,
                                                  num_workers=2)

        num_max_epoch = self.args.vae_max_fit_epoch

        if 'Point2D' in self.args.env_name:
            min_delta = 0.05
        else:
            min_delta = 0.005  # TODO: tune

        early_stopping = EarlyStopping(mode='min',
                                       min_delta=min_delta,
                                       patience=num_max_epoch // 10)

        t = tqdm(range(num_max_epoch))
        for i_epoch in t:
            loss_train = self._train(loader_train, i_epoch)

            t.set_description('train loss: {}'.format(loss_train))

            if i_epoch == 0 or (i_epoch + 1) % (num_max_epoch // 5) == 0:
                loss_test = self._eval(loader_test, i_epoch)
                # print('epoch: {}\tloss: {}'.format(i_epoch, losses.avg))
                t.write('epoch: {}\ttrain loss: {}\ttest_loss: {}'.format(
                    i_epoch, loss_train, loss_test))

            if i_epoch > num_max_epoch // 5:
                if early_stopping.step(
                        loss_train):  # doesn't start tracking until epoch 300
                    t.close()
                    break

        model = copy.deepcopy(self.model).cpu()
        mean = self.mean.clone()
        std = self.std.clone()
        torch.save(dict(model=model, mean=mean, std=std), self.filename)
        print('wrote vae model to {}'.format(self.filename))
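
Note: Examples 2, 3, 5, and 12 all drive a step()-style helper, where step(metric) is called once per validation pass and returns True when training should stop. The class below is only a minimal sketch of that assumed interface, written to match the attributes those snippets read (num_bad_epochs, is_better); the classes actually imported by those repositories may differ.

class EarlyStopping:
    """Minimal step()-style early stopper (illustrative sketch only)."""

    def __init__(self, mode='min', min_delta=0.0, patience=10):
        self.mode = mode
        self.min_delta = min_delta
        self.patience = patience
        self.best = None
        self.num_bad_epochs = 0

    def is_better(self, metric, best):
        # In 'min' mode an improvement is a drop of more than min_delta.
        if self.mode == 'min':
            return metric < best - self.min_delta
        return metric > best + self.min_delta

    def step(self, metric):
        # Returns True once `patience` consecutive checks show no improvement.
        if self.best is None or self.is_better(metric, self.best):
            self.best = metric
            self.num_bad_epochs = 0
        else:
            self.num_bad_epochs += 1
        return self.num_bad_epochs >= self.patience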
Example #3
def train(params, m, data_x, data_y):
  es = EarlyStopping(min_delta = params.min_delta, patience = params.patience)

  # optimizer
  optimizer = optim.Adam(filter(lambda p: p.requires_grad, m.parameters()), lr = params.init_learning_rate)
  
  n_batch = data_x.train_size // params.bs if data_x.train_size % params.bs == 0 else data_x.train_size // params.bs + 1
  data_idxs = list(range(data_x.train_size))
  
  # number of iterations
  cur_it = 0
  # write to tensorboard
  writer = SummaryWriter('./history/{}'.format(params.emb_out_path)) if params.write_tfboard else None

  nll_dev = math.inf
  best_nll_dev = math.inf
  kld_dev = math.inf

  for i in range(params.ep):
    shuffle(data_idxs)
    for j in range(n_batch):
      train_idxs = data_idxs[j * params.bs: (j + 1) * params.bs]
      # get padded & sorted batch idxs and their lengths
      padded_batch_x, batch_x_lens = get_batch(train_idxs, data_x, data_x.train_idxs, data_x.train_lens, params.cuda)
      padded_batch_y, batch_y_lens = get_batch(train_idxs, data_y, data_y.train_idxs, data_y.train_lens, params.cuda)

      optimizer.zero_grad()
      m.train()
      nll_batch, kld_batch = m(padded_batch_x, batch_x_lens, padded_batch_y, batch_y_lens)

      cur_it += 1
      loss_batch, alpha = calc_loss_batch(params, nll_batch, kld_batch, cur_it, n_batch)

      loss_batch.backward()
      optimizer.step()

      out_parallel(i, j, n_batch, loss_batch, nll_batch, kld_batch, best_nll_dev, nll_dev, kld_dev, es.num_bad_epochs)
      update_tensorboard(writer, loss_batch, nll_batch, kld_batch, alpha, nll_dev, kld_dev, cur_it)

      if cur_it % params.VAL_EVERY == 0:
        sys.stdout.write('\n') 
        sys.stdout.flush()
        # validation 
        nll_dev, kld_dev = test(params, m, data_x, data_y)
        if es.step(nll_dev):
          print('\nEarly Stopped.')
          return
        elif es.is_better(nll_dev, best_nll_dev):
          best_nll_dev = nll_dev
          # save model
          m.save_embedding(params, data_x, 'x')
          m.save_embedding(params, data_y, 'y')
          m.save_model(params, data_x, data_y, optimizer)
Example #4
    def __init__(self, teacher_model, student_model, device, config, fold_num):
        self.config = config
        self.epoch = 0
        self.start_epoch = 0
        self.fold_num = fold_num
        if self.config.stage2:
            self.base_dir = f'./result/stage2/{config.dir}/{config.dir}_fold_{config.fold_num}'
        else:
            self.base_dir = f'./result/{config.dir}/{config.dir}_fold_{config.fold_num}'
        os.makedirs(self.base_dir, exist_ok=True)
        self.log_path = f'{self.base_dir}/log.txt'
        self.best_summary_loss = 10**5

        self.teacher_model = teacher_model
        self.teacher_model.eval()

        self.student_model = student_model
        self.device = device
        self.wandb = True

        self.cutmix = self.config.cutmix_ratio
        self.fmix = self.config.fmix_ratio
        self.smix = self.config.smix_ratio

        self.es = EarlyStopping(patience=5)

        self.scaler = GradScaler()
        self.amp = self.config.amp
        param_optimizer = list(self.student_model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in param_optimizer
                           if not any(nd in n for nd in no_decay)],
                'weight_decay': 0.001,
            },
            {
                'params': [p for n, p in param_optimizer
                           if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0,
            },
        ]

        self.optimizer, self.scheduler = get_optimizer(
            self.student_model, self.config.optimizer_name,
            self.config.optimizer_params, self.config.scheduler_name,
            self.config.scheduler_params, self.config.n_epochs)

        self.criterion = get_criterion(self.config.criterion_name,
                                       self.config.criterion_params)
        self.log(f'Fitter prepared. Device is {self.device}')
        set_wandb(self.config, fold_num)
Example #5
def train_model(args: Namespace):
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    for fold in args.folds:
        es = EarlyStopping(patience=args.es_patience)
        print(f'STARTING FOLD {fold}')
        torch.cuda.empty_cache()
        run_name = args.run_name + '_' + str(fold)

        device = torch.device('cuda:0')
        model, optimizer = load_model(args, fold)
        train_loader, val_loader = get_dataloaders(args, fold)
        loss_function = cross_entropy
        scheduler = define_lr_scheduler(args, optimizer, train_loader)

        train_loss = AverageMeter()
        dev_loss = AverageMeter()

        log = get_logger('zindi' + ":" + run_name)
        checkpoint_path = 'checkpoints/' + run_name
        checkpointer = ModelCheckpoint(checkpoint_path,
                                       'checkpoint',
                                       n_saved=5,
                                       score_name='NLL_loss',
                                       save_as_state_dict=False,
                                       require_empty=False)

        for epoch in range(args.max_epochs):
            train_loss.reset()
            dev_loss.reset()

            train(args, model, device, train_loader, optimizer, scheduler,
                  loss_function, epoch, train_loss, log)
            dev_log_loss = validate(model, device, val_loader, loss_function,
                                    epoch, dev_loss, log)

            checkpointer(
                {
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'model_class': args.model_class,
                    'model_hyperparams': args.model_hyperparams,
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': dev_loss.value
                },
                score=-dev_log_loss)

            if es.step(dev_log_loss):
                print('EARLY STOPPING ON EPOCH ' + str(epoch))
                break
Example #6
def runEpoch(loader, model, loss_fn, optimizer, scheduler, device, 
                vis, epoch, iFold, folds_pbar, avg_training_loss,
                avg_training_score, logger_options, optimizer_options, 
                msg_dict):

    ## ======================================= Early Stop ======================================= ##
    early_stop = False
    if not (optimizer_options['early_stopping'] == ""):
        #['min', '0.01', '21']
        mode = optimizer_options['early_stopping'][0]
        min_delta = float(optimizer_options['early_stopping'][1])
        patience = int(optimizer_options['early_stopping'][2])
        early_stopping = EarlyStopping(mode=mode, min_delta=min_delta, patience=patience)
    ## ======================================= Early Stop ======================================= ##
    
    
    trainer = Engine(model, optimizer, loss_fn, scheduler, loader, 
                            optimizer_options["accumulate_count"], device,
                            use_half_precision=optimizer_options["use_half_precision"],
                            score_type="f1")
    

    iteration_pbar = ProgressBar(loader, desc="Iteration", pb_len=optimizer_options['max_iterations'])
    max_iterations = iteration_pbar.total

    for iteration, data_dict in enumerate(iteration_pbar):

        images = data_dict['data']
        phase_annotations = data_dict['target']

        ### ============================== Training ============================== ###
        train_loss, train_score = trainer(images.to(device=device), phase_annotations.to(device=device))
        avg_training_loss.update(train_loss)
        avg_training_score.update(train_score)
        msg_dict['ATL'] = avg_training_loss.get_value()[0]
        msg_dict['ATS'] = avg_training_score.get_value()[0]
        ### ============================== Training ============================== ###
        
        ### ============================== Plot ============================== ###
        if ((iteration) % logger_options["vislogger_interval"] == 0):
            # print(avg_training_loss.get_value()[0])
            vis.line(X=np.array([epoch + (iteration/iteration_pbar.total)]), 
                        Y=np.array([avg_training_loss.get_value()[0]]),
                        update='append', win='Training_Loss_Fold_'+str(iFold+1), 
                        name='Training Loss Fold '+str(iFold+1))
        ### ============================== Plot ============================== ###

        if early_stop:
            iteration_pbar.close()
            print("\n==========================\nEarly stop\n==========================\n")
            break                
        
        folds_pbar.update_message(msg_dict=msg_dict)
        
        if iteration == max_iterations:
            iteration_pbar.refresh()
            iteration_pbar.close()
            break
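
The early-stopping settings in Example 6 arrive as a three-element list of strings (mode, min_delta, patience), as the inline comment shows. The dictionary below is a hypothetical illustration of the optimizer_options keys the function reads; the key names are taken from the snippet itself, the values are made up.

optimizer_options = {
    'early_stopping': ['min', '0.01', '21'],  # mode, min_delta, patience
    'accumulate_count': 1,
    'use_half_precision': False,
    'max_iterations': 1000,
}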
Example #7
    def train(self, image, epochs, enable_es=1):
        graph = tf.Graph()
        with tf.Session(graph=graph) as session:
            tf.set_random_seed(1234)

            self.__create_inputs()
            new_saver = self.__create_graph(self.meta_file)
            self.__create_loss_optimizer()

            # slim.model_analyzer.analyze_vars(tf.trainable_variables() , print_info=True)

            early_stopping = EarlyStopping(patience=30, min_delta=1e-1)

            tf.global_variables_initializer().run()

            new_saver.restore(session,self.latest_checkpoint)
            
            recons_loss = list()
            print('Starting optimization...')
            for cur_epoch in range(epochs + 1):


                dict_loss = self.__train_epoch(session,image)
                list_loss = list(dict_loss.values())

                if np.isnan(list_loss[0]):
                    print ('Encountered NaN, stopping training. Please check the learning_rate settings and the momentum.')
                    sys.exit()

                if(cur_epoch % 20 == 0 or cur_epoch==0):
                    print('EPOCH: {} | dist: {} '.format(cur_epoch, list_loss[0]))
                    
                recons_loss.append(list_loss[0])
                #Early stopping
                if(cur_epoch>50 and enable_es==1 and early_stopping.stop(list_loss[0])):
                    print('Early Stopping!')
                    print('EPOCH: {} | dist: {} '.format(cur_epoch, list_loss[0]))
                    break


            z_infer =  session.run(self.z)
            x_recons = session.run(self.x_recons)

        return z_infer, x_recons, recons_loss
Example #8
def training(model, epoches, lr, wd):
    if torch.cuda.is_available():
        model.cuda()
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)
    criterion = nn.CrossEntropyLoss()
    early_stopper = EarlyStopping(model_dir, patience=PATIENCE)
    for ep in range(epoches):
        model = train_epoch(ep, model, optimizer, criterion, early_stopper)
        optimizer = learning_rate_decay(optimizer)
        
        if early_stopper.early_stop:
            return model
    return model
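
Example 8 constructs EarlyStopping(model_dir, patience=PATIENCE) and later only checks early_stopper.early_stop, so the stopper is presumably updated (and a checkpoint written) inside train_epoch. A rough sketch of that checkpoint-saving style, with an assumed __call__ signature and file name:

import os
import torch

class CheckpointEarlyStopping:
    """Saves the best model and flags early_stop after `patience` bad calls (sketch)."""

    def __init__(self, model_dir, patience=7):
        self.model_dir = model_dir
        self.patience = patience
        self.best_loss = float('inf')
        self.counter = 0
        self.early_stop = False

    def __call__(self, val_loss, model):
        if val_loss < self.best_loss:
            self.best_loss = val_loss
            self.counter = 0
            torch.save(model.state_dict(),
                       os.path.join(self.model_dir, 'best_model.pt'))
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True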
Example #9
def run_ppi(train_loader,
            val_loader,
            test_loader,
            model,
            epochs,
            lr,
            weight_decay,
            patience,
            device,
            logger=True):
    model.to(device)
    optimizer = Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
    #                                                        mode='min',
    #                                                        factor=0.5,
    #                                                        patience=20)
    early_stopping = EarlyStopping(patience=patience)

    # path1 = osp.join(osp.dirname(osp.realpath(__file__)), 'runs', 'train')
    # path2 = osp.join(osp.dirname(osp.realpath(__file__)), 'runs', 'val')

    # writer_train = SummaryWriter(path1)
    # writer_val = SummaryWriter(path2)

    for epoch in range(1, epochs + 1):

        train_loss = train_ppi(model, optimizer, train_loader, device)

        val_loss = val_ppi(model, val_loader, device)
        test_f1 = test_ppi(model, test_loader, device)
        # scheduler.step(val_loss)

        # writer_train.add_scalar('training loss', train_loss, epoch)
        # writer_val.add_scalar('val loss', val_loss, epoch)

        if logger:
            print(
                '{:03d}: Train Loss: {:.4f}, Val Loss: {:.4f}, Test F1: {:.4f}'
                .format(epoch, train_loss, val_loss, test_f1))

        early_stopping(val_loss, test_f1)
        if early_stopping.early_stop:
            best_val_loss = early_stopping.best_score
            best_test_f1 = early_stopping.best_score_acc
            print('Val Loss: {:.3f}, Test F1 Score: {:.4f}'.format(
                best_val_loss, best_test_f1))
            break
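
The stopper in Example 9 is called with two values, early_stopping(val_loss, test_f1), and the snippet later reads best_score and best_score_acc. A plausible sketch of such a two-metric tracker (inferred from the snippet, not from the project's actual class):

class BestScoreTracker:
    """Tracks the best validation loss and the test metric seen at that point (sketch)."""

    def __init__(self, patience=100):
        self.patience = patience
        self.counter = 0
        self.best_score = None
        self.best_score_acc = None
        self.early_stop = False

    def __call__(self, val_loss, acc):
        if self.best_score is None or val_loss < self.best_score:
            self.best_score = val_loss
            self.best_score_acc = acc
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True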
Example #10
def run_std(runs, file_name, **kwargs):
    train_accs, val_accs, test_accs = [], [], []
    for i in range(runs):
        kwargs["model"].reset_parameters()

        es = EarlyStopping(patience=20)

        train_node_acc, val_node_acc, test_node_acc = trainer(
            early_stopping=es, **kwargs)

        train_accs.append(train_node_acc)
        val_accs.append(val_node_acc)
        test_accs.append(test_node_acc)

    with open(file_name, "w") as std_file:
        std_file.write(f"{np.mean(train_accs)}, {np.std(train_accs)}\n")
        std_file.write(f"{np.mean(val_accs)}, {np.std(val_accs)}\n")
        std_file.write(f"{np.mean(test_accs)}, {np.std(test_accs)}\n")
Example #11
    def _get_early_stopper(self):
        return EarlyStopping(
            self.config['stage%d' % self.stage]['stopper']['patience'])
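
The nested lookup in Example 11 implies a per-stage configuration block. A hypothetical layout (only the key names are taken from the lookup; the values are illustrative):

config = {
    'stage1': {'stopper': {'patience': 30}},
    'stage2': {'stopper': {'patience': 15}},
}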
Example #12
    def one_fold(num_fold, train_index, dev_index):
        print("Training on fold:", num_fold)
        X_train, X_dev = [X[i] for i in train_index], [X[i] for i in dev_index]
        y_train, y_dev = y[train_index], y[dev_index]

        # construct data loader
        train_data_set = TrainDataSet(X_train,
                                      y_train,
                                      CONV_PAD_LEN,
                                      SENT_PAD_LEN,
                                      word2id,
                                      use_unk=True)

        dev_data_set = TrainDataSet(X_dev,
                                    y_dev,
                                    CONV_PAD_LEN,
                                    SENT_PAD_LEN,
                                    word2id,
                                    use_unk=True)
        dev_data_loader = DataLoader(dev_data_set,
                                     batch_size=BATCH_SIZE,
                                     shuffle=False)
        # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        pred_list_test_best = None
        final_pred_best = None
        # This is to prevent model divergence; if it happens, retrain
        while True:
            is_diverged = False
            # Model is defined in HierarchicalPredictor
            model = HierarchicalPredictor(SENT_EMB_DIM,
                                          SENT_HIDDEN_SIZE,
                                          num_of_vocab,
                                          USE_ELMO=True,
                                          ADD_LINEAR=False)
            model.load_embedding(emb)
            model.deepmoji_model.load_specific_weights(
                PRETRAINED_PATH, exclude_names=['output_layer'])
            model.cuda()
            # model = nn.DataParallel(model)
            # model.to(device)
            optimizer = optim.Adam(model.parameters(),
                                   lr=learning_rate,
                                   amsgrad=True)  #
            # optimizer = optim.SGD(model.parameters(), lr=learning_rate)
            scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer,
                                                               gamma=opt.gamma)

            if opt.w == 1:
                weight_list = [0.3, 0.3, 0.3, 1.7]
                weight_list_binary = [2 - weight_list[-1], weight_list[-1]]
            elif opt.w == 2:
                weight_list = [
                    0.3198680179, 0.246494733, 0.2484349259, 1.74527696
                ]
                weight_list_binary = [2 - weight_list[-1], weight_list[-1]]
            else:
                raise ValueError

            weight_list = [x**FLAT for x in weight_list]
            weight_label = torch.Tensor(weight_list).cuda()

            weight_list_binary = [x**FLAT for x in weight_list_binary]
            weight_binary = torch.Tensor(weight_list_binary).cuda()
            print('classification reweight: ', weight_list)
            print('binary loss reweight = weight_list_binary',
                  weight_list_binary)
            # loss_criterion_binary = nn.CrossEntropyLoss(weight=weight_list_binary)  #
            if opt.loss == 'focal':
                loss_criterion = FocalLoss(gamma=opt.focal, reduce=False)
                loss_criterion_binary = FocalLoss(gamma=opt.focal,
                                                  reduce=False)  #
            elif opt.loss == 'ce':
                loss_criterion = nn.CrossEntropyLoss(reduce=False)
                loss_criterion_binary = nn.CrossEntropyLoss(reduce=False)  #

            loss_criterion_emo_only = nn.MSELoss()

            es = EarlyStopping(patience=EARLY_STOP_PATIENCE)
            # best_model = None
            final_pred_list_test = None
            pred_list_test = None
            for num_epoch in range(MAX_EPOCH):
                # to ensure shuffling at every epoch
                train_data_loader = DataLoader(train_data_set,
                                               batch_size=BATCH_SIZE,
                                               shuffle=True)

                print('Begin training epoch:', num_epoch, end='...\t')
                sys.stdout.flush()

                # stepping scheduler
                scheduler.step(num_epoch)
                print('Current learning rate', scheduler.get_lr())

                train_loss = 0
                model.train()
                for i, (a, a_len, b, b_len, c, c_len, emoji_a, emoji_b, emoji_c, e_c, e_c_binary, e_c_emo) \
                        in tqdm(enumerate(train_data_loader), total=len(train_data_set)/BATCH_SIZE):
                    optimizer.zero_grad()
                    elmo_a = elmo_encode(a)
                    elmo_b = elmo_encode(b)
                    elmo_c = elmo_encode(c)

                    pred, pred2, pred3 = model(a.cuda(), a_len, b.cuda(),
                                               b_len, c.cuda(), c_len,
                                               emoji_a.cuda(), emoji_b.cuda(),
                                               emoji_c.cuda(), elmo_a, elmo_b,
                                               elmo_c)

                    loss_label = loss_criterion(pred,
                                                e_c.view(-1).cuda()).cuda()
                    loss_label = torch.matmul(torch.gather(weight_label, 0, e_c.view(-1).cuda()), loss_label) / \
                                 e_c.view(-1).shape[0]

                    loss_binary = loss_criterion_binary(
                        pred2,
                        e_c_binary.view(-1).cuda()).cuda()
                    loss_binary = torch.matmul(
                        torch.gather(weight_binary, 0,
                                     e_c_binary.view(-1).cuda()),
                        loss_binary) / e_c.view(-1).shape[0]

                    loss_emo = loss_criterion_emo_only(pred3, e_c_emo.cuda())

                    loss = (loss_label + LAMBDA1 * loss_binary +
                            LAMBDA2 * loss_emo) / float(1 + LAMBDA1 + LAMBDA2)

                    # loss = torch.matmul(torch.gather(weight, 0, trg.view(-1).cuda()), loss) / trg.view(-1).shape[0]

                    # training trilogy
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
                    optimizer.step()

                    train_loss += loss.data.cpu().numpy() * a.shape[0]
                    del pred, loss, elmo_a, elmo_b, elmo_c, e_c_emo, loss_binary, loss_label, loss_emo

                # Evaluate
                model.eval()
                dev_loss = 0
                # pred_list = []
                # gold_list = []
                for i, (a, a_len, b, b_len, c, c_len, emoji_a, emoji_b, emoji_c, e_c, e_c_binary, e_c_emo)\
                        in enumerate(dev_data_loader):
                    with torch.no_grad():

                        elmo_a = elmo_encode(a)
                        elmo_b = elmo_encode(b)
                        elmo_c = elmo_encode(c)

                        pred, pred2, pred3 = model(a.cuda(), a_len, b.cuda(),
                                                   b_len, c.cuda(), c_len,
                                                   emoji_a.cuda(),
                                                   emoji_b.cuda(),
                                                   emoji_c.cuda(), elmo_a,
                                                   elmo_b, elmo_c)

                        loss_label = loss_criterion(
                            pred,
                            e_c.view(-1).cuda()).cuda()
                        loss_label = torch.matmul(
                            torch.gather(weight_label, 0,
                                         e_c.view(-1).cuda()),
                            loss_label) / e_c.view(-1).shape[0]

                        loss_binary = loss_criterion_binary(
                            pred2,
                            e_c_binary.view(-1).cuda()).cuda()
                        loss_binary = torch.matmul(
                            torch.gather(weight_binary, 0,
                                         e_c_binary.view(-1).cuda()),
                            loss_binary) / e_c.view(-1).shape[0]

                        loss_emo = loss_criterion_emo_only(
                            pred3, e_c_emo.cuda())

                        loss = (loss_label + LAMBDA1 * loss_binary + LAMBDA2 *
                                loss_emo) / float(1 + LAMBDA1 + LAMBDA2)

                        dev_loss += loss.data.cpu().numpy() * a.shape[0]

                        # pred_list.append(pred.data.cpu().numpy())
                        # gold_list.append(e_c.numpy())
                        del pred, loss, elmo_a, elmo_b, elmo_c, e_c_emo, loss_binary, loss_label, loss_emo

                print('Training loss:',
                      train_loss / len(train_data_set),
                      end='\t')
                print('Dev loss:', dev_loss / len(dev_data_set))
                # print(classification_report(gold_list, pred_list, target_names=EMOS))
                # get_metrics(pred_list, gold_list)
                if dev_loss / len(dev_data_set) > 1.3 and num_epoch > 4:
                    print("Model diverged, retry")
                    is_diverged = True
                    break

                if es.step(dev_loss):  # overfitting
                    print('overfitting, loading best model ...')
                    break
                else:
                    if es.is_best():
                        print('saving best model ...')
                        if final_pred_best is not None:
                            del final_pred_best
                        final_pred_best = deepcopy(final_pred_list_test)
                        if pred_list_test_best is not None:
                            del pred_list_test_best
                        pred_list_test_best = deepcopy(pred_list_test)
                    else:
                        print('not best model, ignoring ...')
                        if final_pred_best is None:
                            final_pred_best = deepcopy(final_pred_list_test)
                        if pred_list_test_best is None:
                            pred_list_test_best = deepcopy(pred_list_test)

                # Gold Dev testing...
                print('Gold Dev testing....')
                pred_list_test = []
                model.eval()
                for i, (a, a_len, b, b_len, c, c_len, emoji_a, emoji_b,
                        emoji_c) in enumerate(gold_dev_data_loader):
                    with torch.no_grad():
                        elmo_a = elmo_encode(a)  # , __id2word=ex_id2word
                        elmo_b = elmo_encode(b)
                        elmo_c = elmo_encode(c)

                        pred, _, _ = model(a.cuda(), a_len, b.cuda(), b_len,
                                           c.cuda(), c_len, emoji_a.cuda(),
                                           emoji_b.cuda(), emoji_c.cuda(),
                                           elmo_a, elmo_b, elmo_c)

                        pred_list_test.append(pred.data.cpu().numpy())
                    del elmo_a, elmo_b, elmo_c, a, b, c, pred
                pred_list_test = np.argmax(np.concatenate(pred_list_test,
                                                          axis=0),
                                           axis=1)
                # get_metrics(load_dev_labels('data/dev.txt'), pred_list_test)

                # Testing
                print('Gold test testing...')
                final_pred_list_test = []
                model.eval()
                for i, (a, a_len, b, b_len, c, c_len, emoji_a, emoji_b,
                        emoji_c) in enumerate(test_data_loader):
                    with torch.no_grad():
                        elmo_a = elmo_encode(a)  # , __id2word=ex_id2word
                        elmo_b = elmo_encode(b)
                        elmo_c = elmo_encode(c)

                        pred, _, _ = model(a.cuda(), a_len, b.cuda(), b_len,
                                           c.cuda(), c_len, emoji_a.cuda(),
                                           emoji_b.cuda(), emoji_c.cuda(),
                                           elmo_a, elmo_b, elmo_c)

                        final_pred_list_test.append(pred.data.cpu().numpy())
                    del elmo_a, elmo_b, elmo_c, a, b, c, pred
                final_pred_list_test = np.argmax(np.concatenate(
                    final_pred_list_test, axis=0),
                                                 axis=1)
                # get_metrics(load_dev_labels('data/test.txt'), final_pred_list_test)

            if is_diverged:
                print("Reinitialize model ...")
                del model
                continue

            all_fold_results.append(pred_list_test_best)
            real_test_results.append(final_pred_best)
            del model
            break
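
Example 12 also calls es.is_best() right after es.step(dev_loss) to decide whether to snapshot its predictions. One plausible (assumed) extension of the step()-style sketch shown after Example #2:

class EarlyStoppingWithBest(EarlyStopping):
    def is_best(self):
        # True when the most recent step() registered an improvement.
        return self.num_bad_epochs == 0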
Example #13
    def __init__(self,
                 data,
                 model_config,
                 learning_config,
                 pretrained_weight,
                 early_stopping=True,
                 patience=100,
                 json_path=None,
                 pickle_folder=None,
                 vocab_path=None,
                 mapping_path=None,
                 odir=None,
                 model_src_path=None,
                 gdot_path=None):
        if model_src_path is not None:
            sys.path.insert(0, model_src_path)
            print('*** [app][__init__] model_src_path', model_src_path)
            from model_edgnn_o import Model
        else:
            from models.model_edgnn_o import Model

        print('*** [app][__init__] gdot_path', gdot_path)

        self.data = data
        self.model_config = model_config
        # max length of a sequence (max nodes among graphs)
        self.learning_config = learning_config
        self.pretrained_weight = pretrained_weight
        self.is_cuda = learning_config['cuda']

        # with open(vocab_path+'/../mapping.json', 'r') as f:
        with open(mapping_path, 'r') as f:
            self.mapping = json.load(f)

        self.labels = self.data[LABELS]
        self.graphs_names = self.data[GNAMES]

        self.data_graph = self.data[GRAPH]

        # save nid and eid to nodes & edges
        # print('self.data_graph[0]', self.data_graph[0])
        # if 'nid' not in self.data_graph[0].ndata:
        # # if True:
        #     for k,g in enumerate(self.data_graph):
        #         g = self.write_nid_eid(g)
        #         self.data_graph[k] = g
        #     # print('self.data_graph', self.data_graph)
        # save_pickle(self.data_graph, os.path.join(pickle_folder, GRAPH))

        self.data_nclasses = self.data[N_CLASSES]
        if N_RELS in self.data:
            self.data_nrels = self.data[N_RELS]
        else:
            self.data_nrels = None

        if N_ENTITIES in self.data:
            self.data_nentities = self.data[N_ENTITIES]
        else:
            self.data_nentities = None

        self.ModelObj = Model
        self.model_src_path = model_src_path

        self.model = self.ModelObj(
            g=self.data_graph[0],
            config_params=self.model_config,
            n_classes=self.data_nclasses,
            n_rels=self.data_nrels,
            n_entities=self.data_nentities,
            is_cuda=self.is_cuda,
            batch_size=1,
            #    json_path=json_path,
            #    vocab_path=vocab_path,
            model_src_path=model_src_path)

        if self.is_cuda is True:
            print('[app][__init__] Convert model to use cuda')
            self.model = self.model.cuda()
            # self.model = self.model.to(torch.device('cuda:{}'.format(self.learning_config['gpu'])))

        print('>>> [app][__init__] self.model', self.model)
        print('>>> [app][__init__] Check if model use cuda',
              next(self.model.parameters()).is_cuda)

        # print('*** [app][__init__] Model parameters ***')
        # pp=0
        # for p in list(self.model.parameters()):
        #     nn=1
        #     for s in list(p.size()):
        #         # print('p', p)
        #         print('\t s, nn, nn*s', s, nn, nn*s)
        #         nn = nn*s
        #     pp += nn
        # print('[app][__init__] Total params', pp)

        if early_stopping:
            self.early_stopping = EarlyStopping(patience=patience,
                                                verbose=True)

        # Output folder to save train / test data
        if odir is None:
            odir = 'output/' + time.strftime("%Y-%m-%d_%H-%M-%S")
        self.odir = odir
Example #14
class App:
    """
    App inference
    """

    TRAIN_SIZE = 0.7

    def __init__(self,
                 data,
                 model_config,
                 learning_config,
                 pretrained_weight,
                 early_stopping=True,
                 patience=100,
                 json_path=None,
                 pickle_folder=None,
                 vocab_path=None,
                 mapping_path=None,
                 odir=None,
                 model_src_path=None,
                 gdot_path=None):
        if model_src_path is not None:
            sys.path.insert(0, model_src_path)
            print('*** [app][__init__] model_src_path', model_src_path)
            from model_edgnn_o import Model
        else:
            from models.model_edgnn_o import Model

        print('*** [app][__init__] gdot_path', gdot_path)

        self.data = data
        self.model_config = model_config
        # max length of a sequence (max nodes among graphs)
        self.learning_config = learning_config
        self.pretrained_weight = pretrained_weight
        self.is_cuda = learning_config['cuda']

        # with open(vocab_path+'/../mapping.json', 'r') as f:
        with open(mapping_path, 'r') as f:
            self.mapping = json.load(f)

        self.labels = self.data[LABELS]
        self.graphs_names = self.data[GNAMES]

        self.data_graph = self.data[GRAPH]

        # save nid and eid to nodes & edges
        # print('self.data_graph[0]', self.data_graph[0])
        # if 'nid' not in self.data_graph[0].ndata:
        # # if True:
        #     for k,g in enumerate(self.data_graph):
        #         g = self.write_nid_eid(g)
        #         self.data_graph[k] = g
        #     # print('self.data_graph', self.data_graph)
        # save_pickle(self.data_graph, os.path.join(pickle_folder, GRAPH))

        self.data_nclasses = self.data[N_CLASSES]
        if N_RELS in self.data:
            self.data_nrels = self.data[N_RELS]
        else:
            self.data_nrels = None

        if N_ENTITIES in self.data:
            self.data_nentities = self.data[N_ENTITIES]
        else:
            self.data_nentities = None

        self.ModelObj = Model
        self.model_src_path = model_src_path

        self.model = self.ModelObj(
            g=self.data_graph[0],
            config_params=self.model_config,
            n_classes=self.data_nclasses,
            n_rels=self.data_nrels,
            n_entities=self.data_nentities,
            is_cuda=self.is_cuda,
            batch_size=1,
            #    json_path=json_path,
            #    vocab_path=vocab_path,
            model_src_path=model_src_path)

        if self.is_cuda is True:
            print('[app][__init__] Convert model to use cuda')
            self.model = self.model.cuda()
            # self.model = self.model.to(torch.device('cuda:{}'.format(self.learning_config['gpu'])))

        print('>>> [app][__init__] self.model', self.model)
        print('>>> [app][__init__] Check if model use cuda',
              next(self.model.parameters()).is_cuda)

        # print('*** [app][__init__] Model parameters ***')
        # pp=0
        # for p in list(self.model.parameters()):
        #     nn=1
        #     for s in list(p.size()):
        #         # print('p', p)
        #         print('\t s, nn, nn*s', s, nn, nn*s)
        #         nn = nn*s
        #     pp += nn
        # print('[app][__init__] Total params', pp)

        if early_stopping:
            self.early_stopping = EarlyStopping(patience=patience,
                                                verbose=True)

        # Output folder to save train / test data
        if odir is None:
            odir = 'output/' + time.strftime("%Y-%m-%d_%H-%M-%S")
        self.odir = odir

    def write_nid_eid(self, g):
        num_nodes = g.number_of_nodes()
        num_edges = g.number_of_edges()
        g.ndata['nid'] = torch.tensor([-1] * num_nodes)
        g.edata['eid'] = torch.tensor([-1] * num_edges)
        # print("self.g.ndata['nid']", g.ndata['nid'])
        # save nodeid and edgeid to each node and edge
        for nid in range(num_nodes):
            g.ndata['nid'][nid] = torch.tensor([nid]).type(torch.LongTensor)
        for eid in range(g.number_of_edges()):
            g.edata['eid'][eid] = torch.tensor([eid]).type(torch.LongTensor)
        return g

    def train(self,
              save_path='',
              k_fold=10,
              train_list_file=None,
              test_list_file=None):
        if self.pretrained_weight is not None:
            self.model = load_checkpoint(self.model, self.pretrained_weight,
                                         self.is_cuda)
        save_dir = save_path.split('/checkpoint')[0]

        loss_fcn = torch.nn.CrossEntropyLoss()

        # initialize graphs
        self.accuracies = np.zeros(k_fold)
        graphs = self.data[GRAPH]  # load all the graphs

        # debug purposes: reshuffle all the data before the splitting
        random_indices = list(range(len(graphs)))
        random.shuffle(random_indices)
        graphs = [graphs[i] for i in random_indices]
        labels = self.labels[random_indices]
        graphs_names = [self.graphs_names[i] for i in random_indices]

        split_train_test = True if train_list_file is None and test_list_file is None else False
        print('[app][train] split_train_test', split_train_test)
        '''
        if split_train_test is True:
            print('[app][train] train_list_file', train_list_file)
            print('[app][train] test_list_file', test_list_file)
            #############################
            # Create new train/test set
            # Split train and test
            #############################
            train_size = int(self.TRAIN_SIZE * len(graphs))
            g_train = graphs[:train_size]
            l_train = labels[:train_size]
            n_train = graphs_names[:train_size]

            g_test = graphs[train_size:]
            l_test = labels[train_size:]
            n_test = graphs_names[train_size:]
            
        else:
            #############################
            # Load train and test graphs from list
            #############################
            train_files = []
            test_files = []
            g_train = []
            l_train = []
            n_train = []
            g_test = []
            l_test = []
            n_test = []
            with open(train_list_file, 'r') as f:
                train_files = [l.strip() for l in f.readlines()]
            with open(test_list_file, 'r') as f:
                test_files = [l.strip() for l in f.readlines()]
            
            for i in range(len(labels)):
                graph_jsonpath = graphs_names[i]
                # print(graph_jsonpath)
                if graph_jsonpath in train_files:
                    g_train.append(graphs[i])
                    l_train.append(labels[i])
                    n_train.append(graphs_names[i])
                if graph_jsonpath in test_files:
                    g_test.append(graphs[i])
                    l_test.append(labels[i])
                    n_test.append(graphs_names[i])

            l_train = torch.Tensor(l_train).type(torch.LongTensor)
            l_test = torch.Tensor(l_test).type(torch.LongTensor)
            if self.is_cuda is True:
                l_train = l_train.cuda()
                l_test = l_test.cuda()
        '''

        print('[app][train] len labels', len(labels))
        print('[app][train] len g_train', len(g_train))
        # print('[app][train] g_train', g_train)

        if not os.path.isdir(self.odir):
            os.makedirs(self.odir)
        save_pickle(g_train, os.path.join(self.odir, 'train'))
        save_pickle(l_train, os.path.join(self.odir, 'train_labels'))
        save_pickle(g_test, os.path.join(self.odir, 'test'))
        save_pickle(l_test, os.path.join(self.odir, 'test_labels'))

        # save graph name list to txt file
        save_txt(n_train, os.path.join(self.odir, 'train_list.txt'))
        save_txt(n_test, os.path.join(self.odir, 'test_list.txt'))

        K = k_fold
        for k in range(K):
            self.model = self.ModelObj(g=self.data_graph[0],
                                       config_params=self.model_config,
                                       n_classes=self.data_nclasses,
                                       n_rels=self.data_nrels,
                                       n_entities=self.data_nentities,
                                       is_cuda=self.is_cuda,
                                       batch_size=1,
                                       model_src_path=self.model_src_path)

            print('*** [app][__init__] Model layers ***')
            for name, param in self.model.named_parameters():
                if param.requires_grad:
                    print('\t', name, param.data.type())

            print('>>> [app][__init__] self.model.fc.weight.type',
                  self.model.fc.weight.type())

            optimizer = torch.optim.Adam(
                self.model.parameters(),
                lr=self.learning_config['lr'],
                weight_decay=self.learning_config['weight_decay'])

            start = int(len(g_train) / K) * k
            end = int(len(g_train) / K) * (k + 1)
            print('\n\n\n[app][train] Process new k=' + str(k) + ' | ' +
                  str(start) + '-' + str(end))

            # training batch
            train_batch_graphs = g_train[:start] + g_train[end:]
            train_batch_labels = l_train[list(range(0, start)) +
                                         list(range(end, len(g_train)))]
            train_batch_samples = list(
                map(list, zip(train_batch_graphs, train_batch_labels)))
            train_batches = DataLoader(
                train_batch_samples,
                batch_size=self.learning_config['batch_size'],
                shuffle=True,
                collate_fn=collate)

            # testing batch
            val_batch_graphs = g_train[start:end]
            val_batch_labels = l_train[start:end]
            # print('[app][train] val_batch_graphs', val_batch_graphs)
            print('[app][train] len val_batch_graphs', len(val_batch_graphs))
            print('[app][train] val_batch_graphs[0].number_of_nodes()',
                  val_batch_graphs[0].number_of_nodes())
            print('[app][train] val_batch_graphs[-1].number_of_nodes()',
                  val_batch_graphs[-1].number_of_nodes())
            val_batch = dgl.batch(val_batch_graphs)

            print('[app][train] train_batches size: ', len(train_batches))
            print('[app][train] train_batch_graphs size: ',
                  len(train_batch_graphs))
            print('[app][train] val_batch_graphs size: ',
                  len(val_batch_graphs))
            print('[app][train] train_batches', train_batches)
            print('[app][train] val_batch_labels', val_batch_labels)

            dur = []
            for epoch in range(self.learning_config['epochs']):
                self.model.train()
                if epoch >= 3:
                    t0 = time.time()
                losses = []
                training_accuracies = []
                for iter_idx, (bg, label) in enumerate(train_batches):
                    # print('~~~ [app][train] bg', bg)
                    logits = self.model(bg)
                    if self.learning_config['cuda']:
                        label = label.cuda()
                    loss = loss_fcn(logits, label)
                    losses.append(loss.item())
                    _, indices = torch.max(logits, dim=1)
                    # print('~~~~ logits', logits)
                    # print('------------------')
                    print('\t [app][train] indices', indices)
                    # print('\t label', label)
                    correct = torch.sum(indices == label)
                    training_accuracies.append(correct.item() * 1.0 /
                                               len(label))

                    optimizer.zero_grad()
                    loss.backward(retain_graph=True)
                    # loss.backward()
                    optimizer.step()

                if epoch >= 3:
                    dur.append(time.time() - t0)

                val_acc, val_loss, _ = self.model.eval_graph_classification(
                    val_batch_labels, val_batch)
                print(
                    "[app][train] Epoch {:05d} | Time(s) {:.4f} | train_acc {:.4f} | train_loss {:.4f} | val_acc {:.4f} | val_loss {:.4f}"
                    .format(epoch,
                            np.mean(dur) if dur else 0,
                            np.mean(training_accuracies), np.mean(losses),
                            val_acc, val_loss))

                is_better = self.early_stopping(val_loss, self.model,
                                                save_path)
                if is_better:
                    self.accuracies[k] = val_acc

                if self.early_stopping.early_stop:
                    # Print model's state_dict
                    # print("*** Model's state_dict:")
                    # for param_tensor in self.model.state_dict():
                    #     print(param_tensor, "\t", self.model.state_dict()[param_tensor].size())

                    # # Print optimizer's state_dict
                    # print("*** Optimizer's state_dict:")
                    # for var_name in optimizer.state_dict():
                    #     print(var_name, "\t", optimizer.state_dict()[var_name])

                    # Save state dict
                    # torch.save(self.model.state_dict(), save_dir+'/model_state.pt')

                    # Save model
                    # torch.save({
                    #     'epoch': epoch,
                    #     'model_state_dict': self.model.state_dict(),
                    #     'optimizer_state_dict': optimizer.state_dict(),
                    #     'val_loss': val_loss,
                    # }, save_dir+'/saved')

                    print("[app][train] Early stopping")
                    break

            self.early_stopping.reset()

    def test(self, model_path=''):
        print('[app][test] Test model')

        try:
            print('*** [app][test] Load pre-trained model ' + model_path +
                  ' ***')
            self.model = load_checkpoint(self.model, model_path, self.is_cuda)
        except ValueError as e:
            print('[app][test] Error while loading the model.', e)

        self.save_traintest()

        # print('\n[app][test] Test all')
        # # acc = np.mean(self.accuracies)
        # # acc = self.accuracies
        # graphs = self.data[GRAPH]
        # labels = self.labels
        # self.run_test(graphs, labels)

        graphs = load_pickle(os.path.join(self.odir, 'train'))
        labels = load_pickle(os.path.join(self.odir, 'train_labels'))
        print('\n[app][test] Test on train graphs ({})'.format(len(labels)),
              os.path.join(self.odir, 'train'))
        self.run_test_fold(graphs, labels, fold=300)

        graphs = load_pickle(os.path.join(self.odir, 'test'))
        labels = load_pickle(os.path.join(self.odir, 'test_labels'))
        print('\n[app][test] Test on test graphs ({})'.format(len(labels)),
              os.path.join(self.odir, 'test'))
        self.run_test_fold(graphs, labels, fold=150)

    def test_on_data(self, model_path=''):
        print('[app][test_on_data] Test model')

        try:
            print('*** [app][test_on_data] Load pre-trained model ' +
                  model_path + ' ***')
            self.model = load_checkpoint(self.model, model_path, self.is_cuda)
        except ValueError as e:
            print('Error while loading the model.', e)

        print('\n[app][test_on_data] Test on data')
        # acc = np.mean(self.accuracies)
        # acc = self.accuracies
        graphs = self.data[GRAPH]
        labels = self.labels

        self.run_test(graphs, labels)
        # batch_size = 1024
        # batch_num = len(graphs) // batch_size
        # print('batch_num', batch_num)
        # for batch in range(batch_num):
        #     start = (batch)*batch_size
        #     end = (batch+1)*batch_size
        #     graphs = graphs[start:end]
        #     print(batch, len(graphs))
        #     self.run_test(graphs, labels)

    def save_traintest(self):
        graphs = self.data[GRAPH]  # load all the graphs
        # labels = self.labels
        # graphs_names = self.graphs_names
        # debug purposes: reshuffle all the data before the splitting
        random_indices = list(range(len(graphs)))
        random.shuffle(random_indices)
        graphs = [graphs[i] for i in random_indices]
        labels = self.labels[random_indices]
        graphs_names = [self.graphs_names[i] for i in random_indices]

        if True:
            train_list_file = '/media/tunguyen/TuTu_Passport/MTAAV/HAN-sec-new/__save_results/reverse__TuTu__vocabtutu__iapi__tfidf__topk=3/9691/train_list.txt'
            test_list_file = '/media/tunguyen/TuTu_Passport/MTAAV/HAN-sec-new/__save_results/reverse__TuTu__vocabtutu__iapi__tfidf__topk=3/9691/test_list.txt'

            train_list_file = '/media/tunguyen/TuTu_Passport/MTAAV/HAN-sec-new/data/TuTu_train_list.txt'
            test_list_file = '/media/tunguyen/TuTu_Passport/MTAAV/HAN-sec-new/data/TuTu_test_list.txt'

            train_files = []
            test_files = []
            g_train = []
            l_train = []
            n_train = []
            g_test = []
            l_test = []
            n_test = []
            with open(train_list_file, 'r') as f:
                train_files = [l.strip() for l in f.readlines()]
            with open(test_list_file, 'r') as f:
                test_files = [l.strip() for l in f.readlines()]

            for i in range(len(labels)):
                graph_jsonpath = graphs_names[i]
                # print(graph_jsonpath)
                if graph_jsonpath in train_files:
                    g_train.append(graphs[i])
                    l_train.append(labels[i])
                    n_train.append(graphs_names[i])
                if graph_jsonpath in test_files:
                    g_test.append(graphs[i])
                    l_test.append(labels[i])
                    n_test.append(graphs_names[i])

            l_train = torch.Tensor(l_train).type(torch.LongTensor)
            l_test = torch.Tensor(l_test).type(torch.LongTensor)
            if self.is_cuda is True:
                l_train = l_train.cuda()
                l_test = l_test.cuda()

        print('[app][save_traintest] len labels', len(labels))
        print('[app][save_traintest] len l_test', len(l_test))
        print('[app][save_traintest] len l_train', len(l_train))
        tot_bgn = (labels == self.mapping['benign']).sum().item()
        tot_mal = (labels == self.mapping['malware']).sum().item()
        print('[app][save_traintest] tot_bgn', tot_bgn, 'tot_mal', tot_mal)

        if not os.path.isdir(self.odir):
            os.makedirs(self.odir)
        save_pickle(g_train, os.path.join(self.odir, 'train'))
        save_pickle(l_train, os.path.join(self.odir, 'train_labels'))
        save_pickle(g_test, os.path.join(self.odir, 'test'))
        save_pickle(l_test, os.path.join(self.odir, 'test_labels'))

    def run_test_fold(self, graphs, labels, fold=5):
        num_g = len(labels)
        num_g_per_fold = num_g / fold
        cm_all = np.zeros((len(self.mapping), len(self.mapping)))
        # tot_far = 0
        # tot_tpr = 0
        for i in range(fold):
            start_idx = int(i * num_g_per_fold)
            end_idx = int((i + 1) * num_g_per_fold)
            print('* [app][test] Test from {} to {} (total={})'.format(
                start_idx, end_idx, end_idx - start_idx))
            G = graphs[start_idx:end_idx]
            lbls = labels[start_idx:end_idx]
            acc, cm = self.run_test(G, lbls)
            # print('\t ~~ cm', cm)
            cm_all += cm - np.array([[1, 0], [0, 1]])
            # if cm.shape[0] == 2:
            # tot_far += cm[lbl_bng][lbl_mal]

        print(' >> [app][run_test] All FOLD: cm_all', cm_all)

        if len(self.mapping) == 2:
            labels_cpu = labels.cpu()
            lbl_mal = self.mapping['malware']
            lbl_bng = self.mapping['benign']
            n_mal = (labels_cpu == lbl_mal).sum().item()
            n_bgn = (labels_cpu == lbl_bng).sum().item()
            tpr = (cm_all[lbl_mal][lbl_mal] / n_mal * 100).item(
            )  # actual malware that is correctly detected as malware
            far = (cm_all[lbl_bng][lbl_mal] / n_bgn *
                   100).item()  # benign that is incorrectly labeled as malware
            print(' >> [app][run_test] All FOLD: TPR', tpr, 'n_mal', n_mal,
                  ' ||  FAR', far, 'n_bgn', n_bgn)
            total_samples = len(labels)
            total_correct = cm_all[lbl_mal][lbl_mal] + cm_all[lbl_bng][lbl_bng]
            acc_all = (total_correct / total_samples * 100).item()
            print(' >> [app][run_test] All FOLD: Acc', acc_all,
                  '  Total samples', total_samples)

    def run_test(self, graphs, labels):
        batches = dgl.batch(graphs)
        acc, _, logits = self.model.eval_graph_classification(labels, batches)
        _, indices = torch.max(logits, dim=1)
        labels_cpu = labels.cpu()
        indices_cpu = indices.cpu()
        # print('\t [run_test] labels', labels)
        # print('\t [run_test] indices', indices)
        # labels_txt = ['malware', 'benign']

        # print('\t [app][run_test] Total samples', len(labels_cpu))

        # prepend this to make sure cm shape is always (2,2)
        labels_cpu = torch.cat((labels_cpu, torch.tensor([0, 1])), 0)
        indices_cpu = torch.cat((indices_cpu, torch.tensor([0, 1])), 0)

        cm = confusion_matrix(y_true=labels_cpu, y_pred=indices_cpu)
        C = cm / cm.sum(axis=1, keepdims=True).astype(float)  # row-normalised confusion matrix (unused below)
        # print('\t [app][run_test] confusion_matrix:', cm)

        # if len(self.mapping) == 2:
        #     lbl_mal = self.mapping['malware']
        #     lbl_bng = self.mapping['benign']
        #     n_mal = (labels_cpu == lbl_mal).sum().item()
        #     n_bgn = (labels_cpu == lbl_bng).sum().item()
        #     tpr = cm[lbl_mal][lbl_mal]/n_mal * 100 # actual malware that is correctly detected as malware
        #     far = cm[lbl_bng][lbl_mal]/n_bgn * 100  # benign that is incorrectly labeled as malware
        #     print('\t [app][run_test] TPR', tpr, ' ||  FAR', far, 'n_bgn', n_bgn)
        #     # print('\t [app][run_test] FAR', far, 'n_bgn', n_bgn)

        # fig = plt.figure()
        # ax = fig.add_subplot(111)
        # cax = ax.matshow(cm)
        # plt.title('Confusion matrix of the classifier')
        # fig.colorbar(cax)
        # # ax.set_xticklabels([''] + labels)
        # # ax.set_yticklabels([''] + labels)
        # plt.xlabel('Predicted')
        # plt.ylabel('True')
        # plt.show()

        print("\t [app][run_test] Accuracy {:.4f}".format(acc))

        # acc = np.mean(self.accuracies)

        return acc, cm
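As a side note (not part of the original example), the label-padding trick used in run_test above can be illustrated with a minimal, self-contained sketch; the sample arrays below are purely illustrative:

import numpy as np
from sklearn.metrics import confusion_matrix

# a batch that happens to contain only malware samples (label 1)
y_true = np.array([1, 1, 1])
y_pred = np.array([1, 1, 0])

# pad one synthetic sample per class so the matrix is always 2x2 ...
padded_true = np.concatenate([y_true, [0, 1]])
padded_pred = np.concatenate([y_pred, [0, 1]])

# ... then subtract the identity to remove the synthetic counts again,
# which is exactly what run_test_fold does when it accumulates cm_all
cm = confusion_matrix(y_true=padded_true, y_pred=padded_pred) - np.eye(2, dtype=int)
print(cm)  # [[0 0]
           #  [1 2]]  -- counts for the real samples only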
Exemplo n.º 15
0
def _train_net(subject,
               model,
               train_loader,
               val_loader,
               loss_function,
               optimizer,
               scheduler=None,
               epochs=500,
               early_stopping=True,
               plot=True,
               track_lr=True,
               pbar=None):
    """
    Main training loop

    Parameters:
     - subject:        Integer, subject ID
     - model:          t.nn.Module (is set to training mode)
     - train_loader:   t.utils.data.DataLoader: training data
     - val_loader:     t.utils.data.DataLoader: validation data
     - loss_function:  function
     - optimizer:      t.optim.Optimizer
     - scheduler:      t.optim.lr_scheduler or None
     - epochs:         Integer, number of epochs to train
     - early_stopping: boolean, if True, store models for all epochs and select the one with the
                       highest validation accuracy
     - plot:           boolean, if True, generate all plots and store on disk
     - track_lr:       boolean, if True, record the learning rate at every epoch
     - pbar:           tqdm progress bar or None, in which case no progress will be displayed
                       (not closed afterwards)

    Returns: (model, metrics, epoch, history)
     - model:   t.nn.Module, trained model
     - metrics: t.tensor, size=[1, 4], accuracy, precision, recall, f1
     - epoch:   integer, equal to epochs if early stopping is not used
     - history: tuple: (loss, accuracy), where both are t.tensor, size=[2, epochs]

    Notes:
     - Model and data will not be moved to gpu, do this outside of this function.
     - When early_stopping is enabled, this function will store all intermediate models
    """

    # prepare result
    loss = t.zeros((2, epochs))
    accuracy = t.zeros((2, epochs))
    lr = None
    if track_lr:
        lr = t.zeros((epochs))

    # prepare early_stopping
    if early_stopping:
        early_stopping = EarlyStopping()

    use_cuda = model.is_cuda()

    # train model for all epochs
    for epoch in range(epochs):
        # train the model
        train_loss, train_accuracy = _train_epoch(model,
                                                  train_loader,
                                                  loss_function,
                                                  optimizer,
                                                  scheduler=scheduler,
                                                  use_cuda=use_cuda)

        # collect current loss and accuracy
        validation_loss, validation_accuracy = _test_net(model,
                                                         val_loader,
                                                         loss_function,
                                                         train=False,
                                                         use_cuda=use_cuda)
        loss[0, epoch] = train_loss
        loss[1, epoch] = validation_loss
        accuracy[0, epoch] = train_accuracy
        accuracy[1, epoch] = validation_accuracy
        if track_lr:
            lr[epoch] = optimizer.param_groups[0]['lr']

        # do early stopping
        if early_stopping:
            early_stopping.checkpoint(model, loss[1, epoch],
                                      accuracy[1, epoch], epoch)

        if pbar is not None:
            pbar.update()

    # get the best model
    if early_stopping:
        model, best_loss, best_accuracy, best_epoch = early_stopping.use_best_model(
            model)
    else:
        best_epoch = epoch

    # generate plots
    if plot:
        generate_plots(subject, model, val_loader, loss, accuracy, lr=lr)

    metrics = get_metrics_from_model(model, val_loader)

    return model, metrics, best_epoch + 1, (loss, accuracy)
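The EarlyStopping helper used by this example is not shown; a minimal sketch of an interface that satisfies the checkpoint() / use_best_model() calls above (hypothetical, and keeping only the best state in memory rather than saving every epoch as the docstring note suggests the original does) could look like this:

import copy

class EarlyStopping:
    """Remember the best model seen so far and restore it on demand."""

    def __init__(self):
        self.best_accuracy = -float('inf')
        self.best_loss = None
        self.best_epoch = 0
        self.best_state = None

    def checkpoint(self, model, loss, accuracy, epoch):
        # record the weights whenever validation accuracy improves
        if accuracy > self.best_accuracy:
            self.best_accuracy = accuracy
            self.best_loss = loss
            self.best_epoch = epoch
            self.best_state = copy.deepcopy(model.state_dict())

    def use_best_model(self, model):
        # restore the best weights and report when they were recorded
        if self.best_state is not None:
            model.load_state_dict(self.best_state)
        return model, self.best_loss, self.best_accuracy, self.best_epoch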
Exemplo n.º 16
0
    def train(self, data_train, data_valid, enable_es=1):

        with tf.Session(graph=self.graph) as session:
            tf.set_random_seed(1234)

            logger = Logger(session, self.summary_dir)
            # here you initialize the tensorflow saver that will be used in saving the checkpoints.
            # max_to_keep: defaults to keeping the 5 most recent checkpoints of your model
            saver = tf.train.Saver()
            self.session = session
            early_stopping = EarlyStopping(name='total loss',
                                           decay_fn=self.decay_fn)

            if (self.restore and self.load(session, saver)):
                num_epochs_trained = self.model_graph.cur_epoch_tensor.eval(
                    session)
                print('EPOCHS trained: ', num_epochs_trained)
            else:
                print('Initializing Variables ...')
                tf.global_variables_initializer().run()

            if self.model_graph.cur_epoch_tensor.eval(session) == self.epochs:
                return

            for cur_epoch in range(
                    self.model_graph.cur_epoch_tensor.eval(session),
                    self.epochs + 1, 1):

                print('EPOCH: ', cur_epoch)
                self.current_epoch = cur_epoch

                loss_tr, recons_tr, L2_loss = self.train_epoch(
                    session, logger, data_train)
                if np.isnan(loss_tr):
                    print(
                        'Encountered NaN, stopping training. Please check the learning_rate settings and the momentum.'
                    )
                    print('Recons: ', recons_tr)
                    sys.exit()

                loss_val, recons_val = self.valid_epoch(
                    session, logger, data_valid)

                print('TRAIN | AE Loss: ', loss_tr, ' | Recons: ', recons_tr,
                      ' | L2_loss: ', L2_loss)
                print('VALID | AE Loss: ', loss_val, ' | Recons: ', recons_val)

                if (cur_epoch == 1) or ((cur_epoch % const.SAVE_EPOCH == 0) and
                                        ((cur_epoch != 0))):
                    self.save(
                        session, saver,
                        self.model_graph.global_step_tensor.eval(session))
                    if self.plot:
                        self.generate_samples(data_train, session, cur_epoch)

                    if self.clustering:
                        self.generate_clusters(logger, cur_epoch, data_train,
                                               data_valid)

                session.run(self.model_graph.increment_cur_epoch_tensor)

                #Early stopping
                if (enable_es == 1 and early_stopping.stop(loss_val)):
                    print('Early Stopping!')
                    break

                if cur_epoch % 50 == 0:
                    if self.colab:
                        self.push_colab()

            self.save(session, saver,
                      self.model_graph.global_step_tensor.eval(session))
            if self.plot:
                self.generate_samples(data_train, session, cur_epoch)

            if self.clustering:
                self.generate_clusters(logger, cur_epoch, data_train,
                                       data_valid)

            if self.colab:
                self.push_colab()
        return
Exemplo n.º 17
0
def sent_clf(dataset, config, opts, transfer=False):
    from logger.experiment import Experiment

    opts.name = config["name"]
    X_train, y_train, _, X_val, y_val, _ = dataset
    vocab = None
    if transfer:
        opts.transfer = config["pretrained_lm"]
        checkpoint = load_checkpoint(opts.transfer)
        config["vocab"].update(checkpoint["config"]["vocab"])
        dict_pattern_rename(checkpoint["config"]["model"],
                            {"rnn_": "bottom_rnn_"})
        config["model"].update(checkpoint["config"]["model"])
        vocab = checkpoint["vocab"]

    ####################################################################
    # Load Preprocessed Datasets
    ####################################################################
    if config["preprocessor"] == "twitter":
        preprocessor = twitter_preprocessor()
    else:
        preprocessor = None

    print("Building training dataset...")
    train_set = ClfDataset(X_train,
                           y_train,
                           vocab=vocab,
                           preprocess=preprocessor,
                           vocab_size=config["vocab"]["size"],
                           seq_len=config["data"]["seq_len"])

    print("Building validation dataset...")
    val_set = ClfDataset(X_val,
                         y_val,
                         seq_len=train_set.seq_len,
                         preprocess=preprocessor,
                         vocab=train_set.vocab)

    src_lengths = [len(x) for x in train_set.data]
    val_lengths = [len(x) for x in val_set.data]

    # select sampler & dataloader
    train_sampler = BucketBatchSampler(src_lengths, config["batch_size"], True)
    val_sampler = SortedSampler(val_lengths)
    val_sampler_train = SortedSampler(src_lengths)

    train_loader = DataLoader(train_set,
                              batch_sampler=train_sampler,
                              num_workers=opts.cores,
                              collate_fn=ClfCollate())
    val_loader = DataLoader(val_set,
                            sampler=val_sampler,
                            batch_size=config["batch_size"],
                            num_workers=opts.cores,
                            collate_fn=ClfCollate())
    val_loader_train_dataset = DataLoader(train_set,
                                          sampler=val_sampler_train,
                                          batch_size=config["batch_size"],
                                          num_workers=opts.cores,
                                          collate_fn=ClfCollate())
    ####################################################################
    # Model
    ####################################################################
    ntokens = len(train_set.vocab)
    model = Classifier(ntokens, len(set(train_set.labels)), **config["model"])
    model.to(opts.device)

    clf_criterion = nn.CrossEntropyLoss()
    lm_criterion = nn.CrossEntropyLoss(ignore_index=0)

    embed_parameters = filter(lambda p: p.requires_grad,
                              model.embed.parameters())
    bottom_parameters = filter(
        lambda p: p.requires_grad,
        chain(model.bottom_rnn.parameters(), model.vocab.parameters()))
    if config["model"]["has_att"]:
        top_parameters = filter(
            lambda p: p.requires_grad,
            chain(model.top_rnn.parameters(), model.attention.parameters(),
                  model.classes.parameters()))
    else:
        top_parameters = filter(
            lambda p: p.requires_grad,
            chain(model.top_rnn.parameters(), model.classes.parameters()))

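    # three separate optimizers: ASGD for the pretrained embedding and bottom
    # RNN, Adam for the newly added top layers, so the pretrained parts can be
    # updated (or unfrozen, cf. unfreeze_embed / unfreeze_rnn below)
    # independently of the classifier head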
    embed_optimizer = optim.ASGD(embed_parameters, lr=0.0001)
    rnn_optimizer = optim.ASGD(bottom_parameters)
    top_optimizer = Adam(top_parameters, lr=config["top_lr"])
    ####################################################################
    # Training Pipeline
    ####################################################################

    # Trainer: responsible for managing the training process
    trainer = SentClfTrainer(model,
                             train_loader,
                             val_loader, (lm_criterion, clf_criterion),
                             [embed_optimizer, rnn_optimizer, top_optimizer],
                             config,
                             opts.device,
                             valid_loader_train_set=val_loader_train_dataset,
                             unfreeze_embed=config["unfreeze_embed"],
                             unfreeze_rnn=config["unfreeze_rnn"])

    ####################################################################
    # Experiment: logging and visualizing the training process
    ####################################################################

    # exp = Experiment(opts.name, config, src_dirs=opts.source,
    #                  output_dir=EXP_DIR)
    # exp.add_metric("ep_loss_lm", "line", "epoch loss lm",
    #                ["TRAIN", "VAL"])
    # exp.add_metric("ep_loss_cls", "line", "epoch loss class",
    #                ["TRAIN", "VAL"])
    # exp.add_metric("ep_f1", "line", "epoch f1", ["TRAIN", "VAL"])
    # exp.add_metric("ep_acc", "line", "epoch accuracy", ["TRAIN", "VAL"])
    #
    # exp.add_value("epoch", title="epoch summary")
    # exp.add_value("progress", title="training progress")

    ep_loss_lm = [10000, 10000]
    ep_loss_cls = [10000, 10000]
    ep_f1 = [0, 0]
    ep_acc = [0, 0]
    e_log = 0
    progress = 0
    ####################################################################
    # Resume Training from a previous checkpoint
    ####################################################################
    if transfer:
        print("Transferring Encoder weights ...")
        dict_pattern_rename(checkpoint["model"], {
            "encoder": "bottom_rnn",
            "decoder": "vocab"
        })
        load_state_dict_subset(model, checkpoint["model"])
    print(model)

    ####################################################################
    # Training Loop
    ####################################################################
    best_loss = None
    early_stopping = EarlyStopping("min", config["patience"])

    for epoch in range(0, config["epochs"]):

        train_loss = trainer.train_epoch()
        val_loss, y, y_pred = trainer.eval_epoch(val_set=True)
        _, y_train, y_pred_train = trainer.eval_epoch(train_set=True)
        # exp.update_metric("ep_loss_lm", train_loss[0], "TRAIN")
        ep_loss_lm[0] = train_loss[0]
        # exp.update_metric("ep_loss_lm", val_loss[0], "VAL")
        ep_loss_lm[1] = val_loss[0]
        # exp.update_metric("ep_loss_cls", train_loss[1], "TRAIN")
        # exp.update_metric("ep_loss_cls", val_loss[1], "VAL")
        ep_loss_cls[0] = train_loss[1]
        ep_loss_cls[1] = val_loss[1]

        # exp.update_metric("ep_f1", f1_macro(y_train, y_pred_train),
        #                   "TRAIN")
        ep_f1[0] = f1_macro(y_train, y_pred_train)
        # exp.update_metric("ep_f1", f1_macro(y, y_pred), "VAL")
        ep_f1[1] = f1_macro(y, y_pred)

        # exp.update_metric("ep_acc", acc(y_train, y_pred_train), "TRAIN")
        # exp.update_metric("ep_acc", acc(y, y_pred), "VAL")

        ep_acc[0] = acc(y_train, y_pred_train)
        ep_acc[1] = acc(y, y_pred)

        # print('Train lm Loss : {}\nVal lm Loss : {}\nTrain cls Loss : {}\nVal cls Loss : {}\n Train f1 : {}\nVal f1 : {}\nTrain acc : {}\n Val acc : {}'.format(
        #     ep_loss_lm[0], ep_loss_lm[1], ep_loss_cls[0], ep_loss_cls[1], ep_f1[0], ep_f1[1], ep_acc[0], ep_acc[1]
        # ))
        # epoch_log = exp.log_metrics(["ep_loss_lm", "ep_loss_cls","ep_f1", "ep_acc"])
        epoch_log = 'Train lm Loss : {}\nVal lm Loss : {}\nTrain cls Loss : {}\nVal cls Loss : {}\n Train f1 : {}\nVal f1 : {}\nTrain acc : {}\n Val acc : {}'.format(
            ep_loss_lm[0], ep_loss_lm[1], ep_loss_cls[0], ep_loss_cls[1],
            ep_f1[0], ep_f1[1], ep_acc[0], ep_acc[1])
        print(epoch_log)
        # exp.update_value("epoch", epoch_log)
        e_log = epoch_log
        # print('')
        # Save the model if the val loss is the best we've seen so far.
        # if not best_loss or val_loss[1] < best_loss:
        #     best_loss = val_loss[1]
        #     trainer.best_acc = acc(y, y_pred)
        #     trainer.best_f1 = f1_macro(y, y_pred)
        #     trainer.checkpoint(name=opts.name, timestamp=True)
        best_loss = val_loss[1]
        trainer.best_acc = acc(y, y_pred)
        trainer.best_f1 = f1_macro(y, y_pred)
        trainer.checkpoint(name=opts.name, tags=str(epoch))

        # if early_stopping.stop(val_loss[1]):
        #     print("Early Stopping (according to classification loss)....")
        #     break

        print("\n" * 2)

    return best_loss, trainer.best_acc, trainer.best_f1
Exemplo n.º 18
0
def train(params, m, datas):
  # early stopping
  es = EarlyStopping(mode = 'max', patience = params.cldc_patience)
  # set optimizer
  optimizer = get_optimizer(params, m)
  # get initial parameters
  if params.zs_reg_alpha > 0:
    init_param_dict = {k: v.detach().clone() for k, v in m.named_parameters() if v.requires_grad}

  # training
  train_lang, train_data = get_lang_data(params, datas, training = True)
  # dev & test are in the same lang
  test_lang, test_data = get_lang_data(params, datas)

  n_batch = train_data.train_size // params.cldc_bs if train_data.train_size % params.cldc_bs == 0 else train_data.train_size // params.cldc_bs + 1
  # get the same n_batch for unlabelled data as well
  # batch size for unlabelled data
  rest_cldc_bs = train_data.rest_train_size // n_batch
  # per category
  data_idxs = [list(range(len(train_idx))) for train_idx in train_data.train_idxs]
  rest_data_idxs = list(range(len(train_data.rest_train_idxs)))
 
  # number of iterations
  cur_it = 0
  # write to tensorboard
  writer = SummaryWriter('./history/{}'.format(params.log_path)) if params.write_tfboard else None
  # best dev/test
  bdev = 0
  btest = 0
  # current dev/test
  cdev = 0
  ctest = 0
  dev_class_acc = {}
  test_class_acc = {}
  dev_cm = None
  test_cm = None
  # early-stopping warm-up flag: only start early stopping once the train loss drops below a threshold
  es_flag = False
  # set io function
  out_semicldc = getattr(ios, 'out_semicldc_{}'.format(params.cldc_train_mode))

  for i in range(params.cldc_ep):
    for data_idx in data_idxs:
      shuffle(data_idx)
    shuffle(rest_data_idxs)
    for j in range(n_batch):
      train_idxs = []
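      # take a class-proportional slice of each category for this batch
      # (train_prop[k]); the final batch simply absorbs whatever is left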
      for k, data_idx in enumerate(data_idxs):
        if j < n_batch - 1:
          train_idxs.append(data_idx[int(j * params.cldc_bs * train_data.train_prop[k]): int((j + 1) * params.cldc_bs * train_data.train_prop[k])])
          rest_train_idxs = rest_data_idxs[j * rest_cldc_bs: (j + 1) * rest_cldc_bs]
        elif j == n_batch - 1:
          train_idxs.append(data_idx[int(j * params.cldc_bs * train_data.train_prop[k]):])
          rest_train_idxs = rest_data_idxs[j * rest_cldc_bs:]

      # get batch data
      batch_train, batch_train_lens, batch_train_lb, batch_train_ohlb = get_batch(params, train_idxs, train_data.train_idxs, train_data.train_lens) 
      batch_rest_train, batch_rest_train_lens, batch_rest_train_lb, batch_rest_train_ohlb = get_rest_batch(params, rest_train_idxs, train_data.rest_train_idxs, train_data.rest_train_lens, enumerate_discrete)

      optimizer.zero_grad()
      m.train()

      if i + 1 <= params.cldc_warm_up_ep:
        m.warm_up = True
      else:
        m.warm_up = False

      loss_dict, batch_pred = m(train_lang, 
                                batch_train, batch_train_lens, batch_train_lb, batch_train_ohlb, 
                                batch_rest_train, batch_rest_train_lens, batch_rest_train_lb, batch_rest_train_ohlb)
      # regularization term
      if params.zs_reg_alpha > 0:
        reg_loss = .0
        for k, v in m.named_parameters():
          if k in init_param_dict and v.requires_grad:
            reg_loss += torch.sum((v - init_param_dict[k]) ** 2)
        print(reg_loss.detach())
        reg_loss *= params.zs_reg_alpha / 2
        reg_loss.backward()

      batch_acc, batch_acc_cls = get_classification_report(params, batch_train_lb.data.cpu().numpy(), batch_pred.data.cpu().numpy())

      if loss_dict['L_cldc_loss'] < params.cldc_lossth:
        es_flag = True

      #loss_dict['total_loss'].backward()
      out_semicldc(i, j, n_batch, loss_dict, batch_acc, batch_acc_cls, bdev, btest, cdev, ctest, es.num_bad_epochs)
      
      #torch.nn.utils.clip_grad_norm_(filter(lambda p: p.grad is not None and p.requires_grad, m.parameters()), 5)
      '''
      # debug for gradient
      for p_name, p in m.named_parameters():
        if p.grad is not None and p.requires_grad:
          print(p_name, p.grad.data.norm(2).item())
      '''

      optimizer.step()
      cur_it += 1
      update_tensorboard(params, writer, loss_dict, batch_acc, cdev, ctest, dev_class_acc, test_class_acc, cur_it)
      
      if cur_it % params.CLDC_VAL_EVERY == 0:
        sys.stdout.write('\n') 
        sys.stdout.flush()
        # validation 
        cdev, dev_class_acc, dev_cm = test(params, m, test_data.dev_idxs, test_data.dev_lens, test_data.dev_size, test_data.dev_prop, test_lang, cm = True)
        ctest, test_class_acc, test_cm = test(params, m, test_data.test_idxs, test_data.test_lens, test_data.test_size, test_data.test_prop, test_lang, cm = True)
        print(dev_cm)
        print(test_cm)
        if es.step(cdev):
          print('\nEarly Stopped.')
          # vis
          #if params.cldc_visualize:
            #tsne2d(params, m)
          # vis
          return
        elif es.is_better(cdev, bdev):
          bdev = cdev
          btest = ctest
          #save_model(params, m)
        # reset bad epochs
        if not es_flag:
          es.num_bad_epochs = 0
Exemplo n.º 19
0
def train(X_train, y_train, X_dev, y_dev, X_test, y_test):
    num_labels = NUM_EMO

    vocab_size = VOCAB_SIZE

    print('NUM of VOCAB' + str(vocab_size))
    train_data = EmotionDataLoader(X_train, y_train, PAD_LEN)
    train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)

    dev_data = EmotionDataLoader(X_dev, y_dev, PAD_LEN)
    dev_loader = DataLoader(dev_data,
                            batch_size=int(BATCH_SIZE / 3) + 2,
                            shuffle=False)

    test_data = EmotionDataLoader(X_test, y_test, PAD_LEN)
    test_loader = DataLoader(test_data,
                             batch_size=int(BATCH_SIZE / 3) + 2,
                             shuffle=False)

    model = AttentionLSTMClassifier(EMBEDDING_DIM,
                                    HIDDEN_DIM,
                                    vocab_size,
                                    num_labels,
                                    BATCH_SIZE,
                                    att_mode=opt.attention,
                                    soft_last=False,
                                    use_glove=USE_GLOVE,
                                    add_linear=ADD_LINEAR,
                                    max_pool=MAX_POOLING)

    if USE_GLOVE:
        model.load_embedding(tokenizer.get_embeddings())
    # multi-GPU
    # model = nn.DataParallel(model)
    model.cuda()

    if opt.loss == 'ce':
        loss_criterion = nn.CrossEntropyLoss()  #
        print('Using ce loss')
    elif opt.loss == 'focal':
        loss_criterion = FocalLoss(gamma=opt.focal, reduce=True)
        print('Using focal loss, gamma=', opt.focal)
    else:
        raise Exception('loss option not recognised')

    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    es = EarlyStopping(patience=PATIENCE)

    old_model = None
    for epoch in range(1, 300):
        print('Epoch: ' + str(epoch) + '===================================')
        train_loss = 0
        model.train()
        for i, (data, seq_len,
                label) in tqdm(enumerate(train_loader),
                               total=len(train_data) / BATCH_SIZE):
            optimizer.zero_grad()

            data_text = [tokenizer.decode_ids(x) for x in data]
            with torch.no_grad():
                character_ids = batch_to_ids(data_text).cuda()
                elmo_emb = elmo(character_ids)['elmo_representations']
                elmo_emb = (elmo_emb[0] + elmo_emb[1]) / 2  # avg of two layers

                emoji_tokenized, _, _ = st.tokenize_sentences(
                    [' '.join(x) for x in data_text])
                emoji_encoding = emoji_model(
                    torch.LongTensor(emoji_tokenized.astype(np.int32)))

            y_pred = model(data.cuda(), seq_len, elmo_emb,
                           emoji_encoding.cuda())
            loss = loss_criterion(y_pred, label.view(-1).cuda())
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), CLIPS)
            optimizer.step()
            train_loss += loss.data.cpu().numpy() * data.shape[0]
            del y_pred, loss

        test_loss = 0
        model.eval()
        for _, (_data, _seq_len, _label) in enumerate(dev_loader):
            with torch.no_grad():

                data_text = [tokenizer.decode_ids(x) for x in _data]
                character_ids = batch_to_ids(data_text).cuda()
                elmo_emb = elmo(character_ids)['elmo_representations']
                elmo_emb = (elmo_emb[0] + elmo_emb[1]) / 2  # avg of two layers

                emoji_tokenized, _, _ = st.tokenize_sentences(
                    [' '.join(x) for x in data_text])
                emoji_encoding = emoji_model(
                    torch.LongTensor(emoji_tokenized.astype(np.int32)))

                y_pred = model(_data.cuda(), _seq_len, elmo_emb,
                               emoji_encoding.cuda())
                loss = loss_criterion(y_pred, _label.view(-1).cuda())
                test_loss += loss.data.cpu().numpy() * _data.shape[0]
                del y_pred, loss

        print("Train Loss: " + str(train_loss / len(train_data)) + \
              " Evaluation: " + str(test_loss / len(dev_data)))

        if es.step(test_loss):  # overfitting
            del model
            print('overfitting, loading best model ...')
            model = old_model
            break
        else:
            if es.is_best():
                if old_model is not None:
                    del old_model
                print('saving best model ...')
                old_model = deepcopy(model)
            else:
                print('not best model, ignoring ...')
                if old_model is None:
                    old_model = deepcopy(model)

    with open(f'lstm_elmo_deepmoji_{opt.dataset}_model.pt', 'bw') as f:
        torch.save(model.state_dict(), f)

    pred_list = []
    model.eval()
    for _, (_data, _seq_len, _label) in enumerate(test_loader):
        with torch.no_grad():
            data_text = [tokenizer.decode_ids(x) for x in _data]
            character_ids = batch_to_ids(data_text).cuda()
            elmo_emb = elmo(character_ids)['elmo_representations']
            elmo_emb = (elmo_emb[0] + elmo_emb[1]) / 2  # avg of two layers

            emoji_tokenized, _, _ = st.tokenize_sentences(
                [' '.join(x) for x in data_text])
            emoji_encoding = emoji_model(
                torch.LongTensor(emoji_tokenized.astype(np.int32)))

            y_pred = model(_data.cuda(), _seq_len, elmo_emb,
                           emoji_encoding.cuda())
            pred_list.append(
                y_pred.data.cpu().numpy())  # x[np.where( x > 3.0 )]
            del y_pred

    pred_list = np.argmax(np.concatenate(pred_list, axis=0), axis=1)

    return pred_list
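Several of the examples here (this one and the BERT / hierarchical ones below) call es.step(dev_loss) and es.is_best() on a patience-based EarlyStopping. Assuming a "lower is better" metric, a minimal sketch of such an interface (hypothetical, covering only the calls used in these examples) might be:

class EarlyStopping:
    def __init__(self, patience=10, min_delta=0.0):
        self.patience = patience
        self.min_delta = min_delta
        self.best = None
        self.num_bad_epochs = 0
        self._last_was_best = False

    def step(self, metric):
        # returns True once the metric has not improved for `patience` calls
        if self.best is None or metric < self.best - self.min_delta:
            self.best = metric
            self.num_bad_epochs = 0
            self._last_was_best = True
        else:
            self.num_bad_epochs += 1
            self._last_was_best = False
        return self.num_bad_epochs > self.patience

    def is_best(self):
        # whether the last metric passed to step() was the best seen so far
        return self._last_was_best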
Exemplo n.º 20
0
    def one_fold(num_fold, train_index, dev_index):
        print("Training on fold:", num_fold)
        X_train, X_dev = [X[i] for i in train_index], [X[i] for i in dev_index]
        y_train, y_dev = y[train_index], y[dev_index]

        # construct data loader
        train_data_set = DataSet(X_train, y_train, SENT_PAD_LEN)
        train_data_loader = DataLoader(train_data_set,
                                       batch_size=BATCH_SIZE,
                                       shuffle=True)

        dev_data_set = DataSet(X_dev, y_dev, SENT_PAD_LEN)
        dev_data_loader = DataLoader(dev_data_set,
                                     batch_size=BATCH_SIZE,
                                     shuffle=False)
        gradient_accumulation_steps = 1
        num_train_steps = int(
            len(train_data_set) / BATCH_SIZE / gradient_accumulation_steps *
            MAX_EPOCH)

        pred_list_test_best = None
        final_pred_best = None
        # Retry loop: if the model diverges during training, reinitialise and retrain
        while True:
            is_diverged = False
            model = BERT_classifer.from_pretrained(BERT_MODEL)
            model.add_output_layer(BERT_MODEL, NUM_EMO)
            model = nn.DataParallel(model)
            model.cuda()

            # BERT optimizer
            param_optimizer = list(model.named_parameters())
            no_decay = ['bias', 'gamma', 'beta']
            optimizer_grouped_parameters = [{
                'params': [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay_rate':
                0.01
            }, {
                'params': [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay_rate':
                0.0
            }]

            optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=learning_rate,
                                 warmup=0.1,
                                 t_total=num_train_steps)

            if opt.w == 1:
                weight_list = [0.3, 0.3, 0.3, 1.7]
                weight_list_binary = [2 - weight_list[-1], weight_list[-1]]
            elif opt.w == 2:
                weight_list = [
                    0.3198680179, 0.246494733, 0.2484349259, 1.74527696
                ]
                weight_list_binary = [2 - weight_list[-1], weight_list[-1]]

            weight_list = [x**FLAT for x in weight_list]
            weight_label = torch.Tensor(weight_list).cuda()

            weight_list_binary = [x**FLAT for x in weight_list_binary]
            weight_binary = torch.Tensor(weight_list_binary).cuda()
            print('binary loss reweight = weight_list_binary',
                  weight_list_binary)
            # loss_criterion_binary = nn.CrossEntropyLoss(weight=weight_list_binary)  #
            if opt.loss == 'focal':
                loss_criterion = FocalLoss(gamma=opt.focal, reduce=False)
                loss_criterion_binary = FocalLoss(gamma=opt.focal,
                                                  reduce=False)  #
            elif opt.loss == 'ce':
                loss_criterion = nn.CrossEntropyLoss(reduce=False)
                loss_criterion_binary = nn.CrossEntropyLoss(reduce=False)  #

            loss_criterion_emo_only = nn.MSELoss()

            # es = EarlyStopping(min_delta=0.005, patience=EARLY_STOP_PATIENCE)
            es = EarlyStopping(patience=EARLY_STOP_PATIENCE)
            final_pred_best = None
            final_pred_list_test = None
            pred_list_test = None
            for num_epoch in range(MAX_EPOCH):
                print('Begin training epoch:', num_epoch)
                sys.stdout.flush()
                train_loss = 0
                model.train()
                for i, (tokens, masks, segments, e_c, e_c_binary,
                        e_c_emo) in tqdm(enumerate(train_data_loader),
                                         total=len(train_data_set) /
                                         BATCH_SIZE):
                    optimizer.zero_grad()

                    if USE_TOKEN_TYPE:
                        pred, pred2, pred3 = model(tokens.cuda(), masks.cuda(),
                                                   segments.cuda())
                    else:
                        pred, pred2, pred3 = model(tokens.cuda(), masks.cuda())

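                    # each per-sample CE loss is multiplied by the class weight
                    # of its gold label (looked up via torch.gather) and the
                    # weighted sum is divided by the batch size; the same
                    # pattern is applied to the binary head with weight_binary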
                    loss_label = loss_criterion(pred,
                                                e_c.view(-1).cuda()).cuda()
                    loss_label = torch.matmul(torch.gather(weight_label, 0, e_c.view(-1).cuda()), loss_label) / \
                                 e_c.view(-1).shape[0]

                    loss_binary = loss_criterion_binary(
                        pred2,
                        e_c_binary.view(-1).cuda()).cuda()
                    loss_binary = torch.matmul(
                        torch.gather(weight_binary, 0,
                                     e_c_binary.view(-1).cuda()),
                        loss_binary) / e_c.view(-1).shape[0]

                    loss_emo = loss_criterion_emo_only(pred3, e_c_emo.cuda())

                    loss = (loss_label + LAMBDA1 * loss_binary +
                            LAMBDA2 * loss_emo) / float(1 + LAMBDA1 + LAMBDA2)

                    # training trilogy
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
                    optimizer.step()

                    train_loss += loss.data.cpu().numpy() * tokens.shape[0]

                    del loss, pred

                # Evaluate
                model.eval()
                dev_loss = 0
                # pred_list = []
                # gold_list = []
                for i, (tokens, masks, segments, e_c, e_c_binary,
                        e_c_emo) in enumerate(dev_data_loader):
                    with torch.no_grad():
                        if USE_TOKEN_TYPE:
                            pred, pred2, pred3 = model(tokens.cuda(),
                                                       masks.cuda(),
                                                       segments.cuda())
                        else:
                            pred, pred2, pred3 = model(tokens.cuda(),
                                                       masks.cuda())

                        loss_label = loss_criterion(
                            pred,
                            e_c.view(-1).cuda()).cuda()
                        loss_label = torch.matmul(torch.gather(weight_label, 0, e_c.view(-1).cuda()), loss_label) / \
                                     e_c.view(-1).shape[0]

                        loss_binary = loss_criterion_binary(
                            pred2,
                            e_c_binary.view(-1).cuda()).cuda()
                        loss_binary = torch.matmul(
                            torch.gather(weight_binary, 0,
                                         e_c_binary.view(-1).cuda()),
                            loss_binary) / e_c.view(-1).shape[0]

                        loss_emo = loss_criterion_emo_only(
                            pred3, e_c_emo.cuda())

                        loss = (loss_label + LAMBDA1 * loss_binary + LAMBDA2 *
                                loss_emo) / float(1 + LAMBDA1 + LAMBDA2)

                        dev_loss += loss.data.cpu().numpy() * tokens.shape[0]

                        # pred_list.append(pred.data.cpu().numpy())
                        # gold_list.append(e_c.numpy())
                        del pred, loss

                # pred_list = np.argmax(np.concatenate(pred_list, axis=0), axis=1)
                # gold_list = np.concatenate(gold_list, axis=0)
                print('Training loss:',
                      train_loss / len(train_data_set),
                      end='\t')
                print('Dev loss:', dev_loss / len(dev_data_set))
                # print(classification_report(gold_list, pred_list, target_names=EMOS))
                # get_metrics(pred_list, gold_list)
                # checking diverge
                if dev_loss / len(dev_data_set) > 1.3 and num_epoch > 4:
                    print("Model diverged, retry")
                    is_diverged = True
                    break

                if es.step(dev_loss):  # overfitting
                    print('overfitting, loading best model ...')
                    if num_epoch == 1:
                        is_diverged = True
                        final_pred_best = deepcopy(final_pred_list_test)
                        pred_list_test_best = deepcopy(pred_list_test)
                    break
                else:
                    if es.is_best():
                        print('saving best model ...')
                        if final_pred_best is not None:
                            del final_pred_best
                        final_pred_best = deepcopy(final_pred_list_test)
                        if pred_list_test_best is not None:
                            del pred_list_test_best
                        pred_list_test_best = deepcopy(pred_list_test)
                    else:
                        print('not best model, ignoring ...')
                        if final_pred_best is None:
                            final_pred_best = deepcopy(final_pred_list_test)
                        if pred_list_test_best is None:
                            pred_list_test_best = deepcopy(pred_list_test)

                print('Gold Dev ...')
                pred_list_test = []
                model.eval()
                for i, (tokens, masks, segments, e_c, e_c_binary,
                        e_c_emo) in enumerate(gold_dev_data_loader):
                    with torch.no_grad():
                        if USE_TOKEN_TYPE:
                            pred, _, _ = model(tokens.cuda(), masks.cuda(),
                                               segments.cuda())
                        else:
                            pred, _, _ = model(tokens.cuda(), masks.cuda())
                        pred_list_test.append(pred.data.cpu().numpy())

                pred_list_test = np.argmax(np.concatenate(pred_list_test,
                                                          axis=0),
                                           axis=1)
                # get_metrics(load_dev_labels('data/dev.txt'), pred_list_test)

                print('Gold Test ...')
                final_pred_list_test = []
                model.eval()
                for i, (tokens, masks, segments, e_c, e_c_binary,
                        e_c_emo) in enumerate(gold_test_data_loader):
                    with torch.no_grad():
                        if USE_TOKEN_TYPE:
                            pred, _, _ = model(tokens.cuda(), masks.cuda(),
                                               segments.cuda())
                        else:
                            pred, _, _ = model(tokens.cuda(), masks.cuda())
                        final_pred_list_test.append(pred.data.cpu().numpy())

                final_pred_list_test = np.argmax(np.concatenate(
                    final_pred_list_test, axis=0),
                                                 axis=1)
                # get_metrics(load_dev_labels('data/test.txt'), final_pred_list_test)

            if is_diverged:
                print("Reinitialize model ...")
                del model
                continue
            all_fold_results.append(pred_list_test_best)
            real_test_results.append(final_pred_best)

            del model
            break
Exemplo n.º 21
0
    def one_fold(num_fold, train_index, dev_index):
        print("Training on fold:", num_fold)
        X_train, X_dev = [X[i] for i in train_index], [X[i] for i in dev_index]
        y_train, y_dev = y[train_index], y[dev_index]

        # construct data loader
        # for one fold, test data comes from k fold split.
        train_data_set = TrainDataSet(X_train,
                                      y_train,
                                      EMAI_PAD_LEN,
                                      SENT_PAD_LEN,
                                      word2id,
                                      use_unk=True)

        dev_data_set = TrainDataSet(X_dev,
                                    y_dev,
                                    EMAI_PAD_LEN,
                                    SENT_PAD_LEN,
                                    word2id,
                                    use_unk=True)
        dev_data_loader = DataLoader(dev_data_set,
                                     batch_size=BATCH_SIZE,
                                     shuffle=False)
        # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        final_pred_best = None

        # Retry loop: if the model diverges during training, reinitialise and retrain
        while True:
            is_diverged = False
            # Model is defined in HierarchicalPredictor
            model = HierarchicalAttPredictor(SENT_EMB_DIM,
                                             SENT_HIDDEN_SIZE,
                                             CTX_LSTM_DIM,
                                             num_of_vocab,
                                             SENT_PAD_LEN,
                                             id2word,
                                             USE_ELMO=True,
                                             ADD_LINEAR=False)
            model.load_embedding(emb)
            model.deepmoji_model.load_specific_weights(
                PRETRAINED_PATH, exclude_names=['output_layer'])
            model.cuda()

            # model = nn.DataParallel(model)
            # model.to(device)

            optimizer = optim.Adam(model.parameters(),
                                   lr=learning_rate,
                                   amsgrad=True)  #
            # optimizer = optim.SGD(model.parameters(), lr=learning_rate)
            scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer,
                                                               gamma=GAMMA)

            # loss_criterion_binary = nn.CrossEntropyLoss(weight=weight_list_binary)  #
            if opt.loss == 'focal':
                loss_criterion = FocalLoss(gamma=opt.focal)

            elif opt.loss == 'ce':
                loss_criterion = nn.BCELoss()

            es = EarlyStopping(patience=EARLY_STOP_PATIENCE)
            final_pred_list_test = None

            result_print = {}

            for num_epoch in range(MAX_EPOCH):

                # to ensure shuffle at every epoch
                train_data_loader = DataLoader(train_data_set,
                                               batch_size=BATCH_SIZE,
                                               shuffle=True)

                print('Begin training epoch:', num_epoch, end='...\t')
                sys.stdout.flush()

                # stepping scheduler
                scheduler.step(num_epoch)
                print('Current learning rate', scheduler.get_lr())

                ## Training step
                train_loss = 0
                model.train()

                for i, (a, a_len, emoji_a, e_c) \
                        in tqdm(enumerate(train_data_loader), total=len(train_data_set)/BATCH_SIZE):

                    optimizer.zero_grad()
                    e_c = e_c.type(torch.float)
                    pred = model(a.cuda(), a_len, emoji_a.cuda())
                    loss_label = loss_criterion(pred.squeeze(1),
                                                e_c.view(-1).cuda()).cuda()

                    # training trilogy
                    loss_label.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
                    optimizer.step()

                    train_loss += loss_label.data.cpu().numpy() * a.shape[0]
                    del pred, loss_label

                ## Evaluatation step
                model.eval()
                dev_loss = 0
                # pred_list = []
                for i, (a, a_len, emoji_a, e_c) in enumerate(dev_data_loader):

                    with torch.no_grad():
                        e_c = e_c.type(torch.float)
                        pred = model(a.cuda(), a_len, emoji_a.cuda())

                        loss_label = loss_criterion(
                            pred.squeeze(1),
                            e_c.view(-1).cuda()).cuda()

                        dev_loss += loss_label.data.cpu().numpy() * a.shape[0]

                        # pred_list.append(pred.data.cpu().numpy())
                        # gold_list.append(e_c.numpy())
                        del pred, loss_label

                print('Training loss:',
                      train_loss / len(train_data_set),
                      end='\t')
                print('Dev loss:', dev_loss / len(dev_data_set))

                # print(classification_report(gold_list, pred_list, target_names=EMOS))
                # get_metrics(pred_list, gold_list)

                # Gold Test testing
                print('Final test testing...')
                final_pred_list_test = []
                model.eval()

                for i, (a, a_len,
                        emoji_a) in enumerate(final_test_data_loader):

                    with torch.no_grad():

                        pred = model(a.cuda(), a_len, emoji_a.cuda())

                        final_pred_list_test.append(pred.data.cpu().numpy())
                    del a, pred
                print("final_pred_list_test", len(final_pred_list_test))
                final_pred_list_test = np.concatenate(final_pred_list_test,
                                                      axis=0)
                final_pred_list_test = np.squeeze(final_pred_list_test, axis=1)
                print("final_pred_list_test_concat", len(final_pred_list_test))

                accuracy, precision, recall, f1 = get_metrics(
                    np.asarray(final_test_target_list),
                    np.asarray(final_pred_list_test))

                result_print.update(
                    {num_epoch: [accuracy, precision, recall, f1]})

                if dev_loss / len(dev_data_set) > 1.3 and num_epoch > 4:
                    print("Model diverged, retry")
                    is_diverged = True
                    break

                if es.step(dev_loss):  # overfitting
                    print('overfitting, loading best model ...')
                    break
                else:
                    if es.is_best():
                        print('saving best model ...')
                        if final_pred_best is not None:
                            del final_pred_best
                        final_pred_best = deepcopy(final_pred_list_test)

                    else:
                        print('not best model, ignoring ...')
                        if final_pred_best is None:
                            final_pred_best = deepcopy(final_pred_list_test)

            with open(result_path, 'wb') as w:
                pkl.dump(result_print, w)

            if is_diverged:
                print("Reinitialize model ...")
                del model

                continue

            real_test_results.append(np.asarray(final_pred_best))
            # saving model for inference
            torch.save(model.state_dict(), opt.out_path)
            del model
            break
Exemplo n.º 22
0
def train(pairs_batch_train, pairs_batch_dev, encoder, decoder,
          encoder_optimizer, decoder_optimizer, criterion, batch_size,
          num_epochs, device):
    clip = 5.0
    tf_rate = 1
    early_stopping = EarlyStopping(patience=15, verbose=False, delta=0)

    for epoch in range(num_epochs):
        encoder.train()
        decoder.train()

        for _, batch in enumerate(pairs_batch_train):
            pad_input_seqs, input_seq_lengths, pad_target_seqs, pad_target_seqs_lengths = batch
            pad_input_seqs, pad_target_seqs = pad_input_seqs.to(
                device), pad_target_seqs.to(device)

            train_loss = 0

            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()

            encoder_output, encoder_hidden = encoder(pad_input_seqs,
                                                     input_seq_lengths)

            decoder_input = torch.ones(batch_size, 1).long().to(device)
            decoder_hidden = (encoder_hidden[0].sum(0, keepdim=True),
                              encoder_hidden[1].sum(0, keepdim=True))

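            # teacher forcing: with probability tf_rate the gold token is fed
            # as the next decoder input; otherwise the decoder consumes its own
            # argmax prediction from the previous step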
            teacher_forcing = random.random() <= tf_rate

            if teacher_forcing:
                for i in range(0, pad_target_seqs.size(0)):
                    decoder_output, decoder_hidden = decoder(
                        decoder_input, decoder_hidden, encoder_output)
                    target = pad_target_seqs.squeeze()
                    train_loss += criterion(decoder_output, target[i])
                    decoder_input = pad_target_seqs[i]
            else:
                for i in range(0, pad_target_seqs.size(0)):
                    decoder_output, decoder_hidden = decoder(
                        decoder_input, decoder_hidden, encoder_output)
                    _, topi = decoder_output.topk(1)
                    target = pad_target_seqs.squeeze()
                    train_loss += criterion(decoder_output, target[i])
                    decoder_input = topi.detach()

            train_loss.backward()
            torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)
            torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)
            encoder_optimizer.step()
            decoder_optimizer.step()

        # CALCULATE EVALUATION
        with torch.no_grad():
            for _, batch in enumerate(pairs_batch_dev):
                encoder.eval()
                decoder.eval()

                pad_input_seqs, input_seq_lengths, pad_target_seqs, pad_target_seqs_lengths = batch
                pad_input_seqs, pad_target_seqs = pad_input_seqs.to(
                    device), pad_target_seqs.to(device)

                dev_loss = 0

                encoder_output, encoder_hidden = encoder(
                    pad_input_seqs, input_seq_lengths)

                decoder_input = torch.ones(batch_size, 1).long().to(device)
                decoder_hidden = (encoder_hidden[0].sum(0, keepdim=True),
                                  encoder_hidden[1].sum(0, keepdim=True))

                teacher_forcing = random.random() <= tf_rate

                if teacher_forcing:
                    for i in range(0, pad_target_seqs.size(0)):
                        decoder_output, decoder_hidden = decoder(
                            decoder_input, decoder_hidden, encoder_output)
                        target = pad_target_seqs.squeeze()
                        dev_loss += criterion(decoder_output, target[i])
                        decoder_input = pad_target_seqs[i]
                else:
                    for i in range(0, pad_target_seqs.size(0)):
                        decoder_output, decoder_hidden = decoder(
                            decoder_input, decoder_hidden, encoder_output)
                        _, topi = decoder_output.topk(1)
                        target = pad_target_seqs.squeeze()
                        dev_loss += criterion(decoder_output, target[i])
                        decoder_input = topi.detach()

        #early_stopping(complete_loss_dev, (encoder, decoder, encoder_optimizer, decoder_optimizer))
        #if early_stopping.early_stop:
        #    print('Early stopping')
        #    break

        print('[Epoch: %d] train_loss: %.4f    val_loss: %.4f' %
              (epoch + 1, train_loss.item(), dev_loss.item()))
Exemplo n.º 23
0
def train(X_train, y_train, X_dev, y_dev, X_test, y_test):
    train_set = TrainDataReader(X_train, y_train, MAX_LEN_DATA)
    train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)

    dev_set = TrainDataReader(X_dev, y_dev, MAX_LEN_DATA)
    dev_loader = DataLoader(dev_set, batch_size=BATCH_SIZE * 3, shuffle=False)

    test_set = TestDataReader(X_test, MAX_LEN_DATA)
    test_loader = DataLoader(test_set, batch_size=BATCH_SIZE * 3, shuffle=False)

    # Model initialize
    model = BinaryLSTMClassifier(
        emb_dim=SRC_EMB_DIM,
        vocab_size=glove_tokenizer.get_vocab_size(),
        num_label=NUM_EMO,
        hidden_dim=SRC_HIDDEN_DIM,
        attention_mode=ATTENTION,
        args=args
    )

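    # build separate parameter groups so encoder and decoder can use different
    # learning rates; when the pretrained embeddings are frozen (fix_emb) they
    # are excluded from the encoder group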
    if args.fix_emb:
        para_group = [
            {'params': [p for n, p in model.named_parameters() if n.startswith("encoder") and
                        not 'encoder.embeddings' in n], 'lr': args.en_lr},
            {'params': [p for n, p in model.named_parameters() if n.startswith("decoder")], 'lr': args.de_lr}]
    else:
        para_group = [
            {'params': [p for n, p in model.named_parameters() if n.startswith("encoder")], 'lr': args.en_lr},
            {'params': [p for n, p in model.named_parameters() if n.startswith("decoder")], 'lr': args.de_lr}]
    loss_criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(para_group)
    if args.scheduler:
        epoch_to_step = int(len(train_set) / BATCH_SIZE)
        scheduler = get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=WARMUP_EPOCH * epoch_to_step,
            num_training_steps=STOP_EPOCH * epoch_to_step,
            min_lr_ratio=args.min_lr_ratio
        )

    if args.glorot_init:
        logger('use glorot initialization')
        for group in para_group:
            nn_utils.glorot_init(group['params'])

    if args.huang_init:
        nn_utils.huang_init(model.named_parameters(), uniform=not args.normal_init)
    model.load_encoder_embedding(glove_tokenizer.get_embeddings(), fix_emb=args.fix_emb)
    model.cuda()

    # Start training
    EVAL_EVERY = int(len(train_set) / BATCH_SIZE / 4)
    best_model = None
    es = EarlyStopping(patience=PATIENCE)
    update_step = 0
    exit_training = False

    for epoch in range(1, MAX_EPOCH + 1):
        train_pred = []
        train_gold_list = []
        logger('Training on epoch=%d -------------------------' % (epoch))
        train_loss_sum = 0
        # print('Current encoder learning rate', scheduler.get_lr())
        # print('Current decoder learning rate', scheduler.get_lr())
        for i, (src, src_len, trg) in tqdm(enumerate(train_loader), total=int(len(train_set) / BATCH_SIZE)):
            model.train()
            update_step += 1

            # print('i=%d: ' % (i))
            # trg = torch.index_select(trg, 1, torch.LongTensor(list(range(1, len(EMOS)+1))))
            if args.scheduler:
                scheduler.step()

            optimizer.zero_grad()
            
            decoder_logit = model(src.cuda(), src_len.cuda())

            train_pred.append(np.argmax(decoder_logit.data.cpu().numpy(), axis=-1))
            gold = np.asarray(trg)
            trg_index = []
            for i in range(gold.shape[0]):
                train_gold_list.append(gold[i])
            loss = loss_criterion(decoder_logit, trg.view(-1).cuda())
            loss.backward()
            train_loss_sum += loss.data.cpu().numpy() * src.shape[0]

            torch.nn.utils.clip_grad_norm_(model.parameters(), CLIPS)
            optimizer.step()

            if update_step % EVAL_EVERY == 0 and args.eval_every is not None:
                model, best_model, exit_training = eval(model, best_model, loss_criterion, es, dev_loader, dev_set,
                                                        y_dev)
                if exit_training:
                    break

        logger(f"Training Loss for epoch {epoch}:", train_loss_sum / len(train_set))
        if train_pred:
            print('TRAIN---------: ')
            train_pred = np.concatenate(train_pred, axis=0)
            train_gold_list = np.array(train_gold_list)
            show_classification_report(train_gold_list, train_pred)
        # model, best_model, exit_training = eval(model, best_model, loss_criterion, es, dev_loader, dev_set)
        if exit_training:
            break

    # final_testing
    model.eval()
    preds = []
    logger("Testing:")
    for i, (src, src_len) in tqdm(enumerate(test_loader), total=int(len(test_set) / BATCH_SIZE)):
        with torch.no_grad():
            
            decoder_logit = model(src.cuda(), src_len.cuda())
            preds.append(np.argmax(decoder_logit.data.cpu().numpy(), axis=-1))
            del decoder_logit

    preds = np.concatenate(preds, axis=0)
    gold = np.asarray(y_test)
    #preds = np.argmax(preds, axis=-1)

    logger("NOTE, this is on the test set")
    #metric = get_metrics(gold, preds)
    #logger('Normal: h_loss:', metric[0], 'macro F', metric[1], 'micro F', metric[4])
    # metric = get_multi_metrics(binary_gold, binary_preds)
    # logger('Multi only: h_loss:', metric[0], 'macro F', metric[1], 'micro F', metric[4])
    # show_classification_report(binary_gold, binary_preds)
    # logger('Jaccard:', jaccard_score(gold, preds))
    return gold, preds, model
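The `get_cosine_schedule_with_warmup` call above takes a `min_lr_ratio` argument, so it is presumably a project-specific helper rather than the stock `transformers` scheduler. For reference, a per-step warmup-plus-cosine schedule with a lower bound can be sketched with `torch.optim.lr_scheduler.LambdaLR`; the function name and the numbers below are illustrative assumptions, not the example's actual implementation.

# A minimal sketch (not the snippet's actual helper) of a per-step warmup +
# cosine schedule with a lower bound, reusing the argument names seen above.
import math
import torch
from torch.optim.lr_scheduler import LambdaLR

def cosine_with_warmup_and_floor(optimizer, num_warmup_steps, num_training_steps,
                                 min_lr_ratio=0.0):
    def lr_lambda(step):
        if step < num_warmup_steps:
            # linear warmup from 0 to the base learning rate
            return float(step) / float(max(1, num_warmup_steps))
        # cosine decay from 1.0 down to min_lr_ratio over the remaining steps
        progress = float(step - num_warmup_steps) / float(
            max(1, num_training_steps - num_warmup_steps))
        cosine = 0.5 * (1.0 + math.cos(math.pi * min(1.0, progress)))
        return min_lr_ratio + (1.0 - min_lr_ratio) * cosine
    return LambdaLR(optimizer, lr_lambda)

# usage with hypothetical step counts; step once per batch, as the snippet above does
model = torch.nn.Linear(8, 2)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = cosine_with_warmup_and_floor(optimizer, num_warmup_steps=100,
                                         num_training_steps=1000, min_lr_ratio=0.1)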
Exemplo n.º 24
0
def train():
    """ train """

    """ construct index-based data loader """
    idx = np.arange(args.seq_len + 1, data_obj.num_times)
    idx_dat = dat.TensorDataset(torch.tensor(idx, dtype=torch.int32))
    train_idx_data_loader = dat.DataLoader(dataset=idx_dat, batch_size=args.batch_size, shuffle=True)

    idx = np.arange(args.seq_len + 1, data_obj.num_times)
    idx_dat = dat.TensorDataset(torch.tensor(idx, dtype=torch.int32))
    test_idx_data_loader = dat.DataLoader(dataset=idx_dat, batch_size=1, shuffle=False)

    """ set writer, loss function, and optimizer """
    mse_loss_func = nn.MSELoss()
    mse_sum_loss_func = nn.MSELoss(reduction='sum')
    spatial_loss_func = SpatialLoss(sp_neighbor=args.sp_neighbor)
    temporal_loss_func = TemporalLoss(tp_neighbor=args.tp_neighbor)
    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    early_stopping = EarlyStopping(patience=args.patience, verbose=args.verbose)

    def construct_sequence_x(idx_list, dynamic_x, static_x):
        d_x = [dynamic_x[i - args.seq_len + 1: i + 1, ...] for i in idx_list]
        d_x = np.stack(d_x, axis=0)
        s_x = np.expand_dims(static_x, axis=0)
        s_x = np.repeat(s_x, args.seq_len, axis=1)  # shape: (t, c, h, w)
        s_x = np.repeat(s_x, len(idx_list), axis=0)  # shape: (b, t, c, h, w)
        x = np.concatenate([d_x, s_x], axis=2)
        return torch.tensor(x, dtype=torch.float).to(device)

    def construct_y(idx_list, output_y):
        y = [output_y[i] for i in idx_list]
        y = np.stack(y, axis=0)
        return torch.tensor(y, dtype=torch.float).to(device)

    """ training """
    for epoch in range(args.num_epochs):

        model.train()
        total_losses, train_losses, val_losses, l1_losses, ae_losses, sp_losses = 0, 0, 0, 0, 0, 0

        for _, idx in enumerate(train_idx_data_loader):
            batch_idx = idx[0]

            """ construct sequence input """
            batch_x = construct_sequence_x(batch_idx, data_obj.dynamic_x, data_obj.static_x)  # shape: (b, t, c, h, w)
            batch_y = construct_y(batch_idx, data_obj.train_y)  # shape: (b, 1, h, w)
            batch_val_y = construct_y(batch_idx, data_obj.val_y)

            """ start train """
            out, sparse_x, _, de_x, em = model(batch_x)
            train_loss = mse_loss_func(batch_y[~torch.isnan(batch_y)], out[~torch.isnan(batch_y)])
            train_losses += train_loss.item()

            """ add loss according to the model type """
            total_loss = train_loss
            if 'l1' in model_types:
                l1_loss = model.sparse_layer.l1_loss()
                l1_losses += l1_loss.item()
                total_loss += l1_loss * args.alpha

            if 'ae' in model_types:
                ae_loss = mse_sum_loss_func(sparse_x, de_x)
                ae_losses += ae_loss.item()
                total_loss += ae_loss * args.beta

            if 'sp' in model_types:
                sp_loss = spatial_loss_func(out)
                sp_losses += sp_loss.item()
                total_loss += sp_loss * args.gamma

            # if 'vg' in args.model_type:
            #     # 1-step temporal neighboring loss
            #     pre_batch_idx = batch_idx - torch.ones_like(batch_idx)
            #     pre_batch_x = construct_sequence_x(pre_batch_idx, data_obj.dynamic_x,
            #                                        data_obj.static_x)  # x = (b, t, c, h, w)
            #     _, _, _, _, pre_em = model(pre_batch_x)
            #     tp_loss = torch.mean(torch.mean((em - pre_em) ** 2, axis=1))
            #
            #     # 1-step spatial neighboring loss
            #     sp_loss = 0.
            #     sp_loss += torch.mean(torch.mean((em[..., 1:, 1:] - em[..., :-1, :-1]) ** 2, axis=1))
            #     sp_loss += torch.mean(torch.mean((em[..., 1:, :] - em[..., :-1, :]) ** 2, axis=1))
            #     sp_loss += torch.mean(torch.mean((em[..., :, 1:] - em[..., :, :-1]) ** 2, axis=1))
            #     alosses.append(tp_loss.item() + sp_loss.item())
            #     total_loss += (tp_loss + sp_loss) * args.eta

            total_losses += total_loss.item()

            optimizer.zero_grad()
            total_loss.backward()
            optimizer.step()

            """ validate """
            val_loss = mse_loss_func(batch_val_y[~torch.isnan(batch_val_y)], out[~torch.isnan(batch_val_y)])
            val_losses += val_loss.item()

        if args.verbose:
            logging.info('Epoch [{}/{}] total_loss = {:.3f}, train_loss = {:.3f}, val_loss = {:.3f}, '
                         'l1_losses = {:.3f}, ae_losses = {:.3f}, sp_losses = {:.3f}.'
                         .format(epoch, args.num_epochs, total_losses, train_losses, val_losses,
                                 l1_losses, ae_losses, sp_losses))

        # write for tensor board visualization
        if args.use_tb:
            tb_writer.add_scalar('data/train_loss', train_losses, epoch)
            tb_writer.add_scalar('data/val_loss', val_losses, epoch)

        # early_stopping
        early_stopping(val_losses, model, model_file)

        # evaluate testing data (note: the trailing `and False` keeps this block disabled)
        if len(data_obj.test_loc) == 0 and False:

            model.eval()
            prediction = []

            with torch.no_grad():
                for i, data in enumerate(test_idx_data_loader):
                    batch_idx = data[0]
                    batch_x = construct_sequence_x(batch_idx, data_obj.dynamic_x, data_obj.static_x)  # (b, t, c, h, w)
                    out, _, _, _, _ = model(batch_x)
                    prediction.append(out.cpu().data.numpy())

            prediction = np.concatenate(prediction)
            acc = compute_error(data_obj.test_y[args.seq_len + 1:, ...], prediction)

            if args.verbose:
                logging.info('Epoch [{}/{}] testing: rmse = {:.3f}, mape = {:.3f}, r2 = {:.3f}.'
                             .format(epoch, args.num_epochs, *acc))

        if early_stopping.early_stop:
            break
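The `SpatialLoss` and `TemporalLoss` penalties used above are project-specific and their definitions are not shown here. Based on the neighbor-difference terms in the commented-out block, a one-step spatial smoothness penalty might look like the minimal sketch below; the class name and the way `sp_neighbor` is handled are assumptions, not the original implementation.

# A minimal sketch of a neighbor-difference spatial smoothness penalty,
# assuming predictions shaped (b, c, h, w); this is NOT the original SpatialLoss.
import torch
import torch.nn as nn

class SimpleSpatialLoss(nn.Module):
    def __init__(self, sp_neighbor=1):
        super().__init__()
        self.sp_neighbor = sp_neighbor  # how many pixels away still counts as a neighbor

    def forward(self, out):
        loss = 0.0
        for k in range(1, self.sp_neighbor + 1):
            # penalize squared differences between vertically, horizontally,
            # and diagonally shifted copies of the prediction map
            loss = loss + torch.mean((out[..., k:, :] - out[..., :-k, :]) ** 2)
            loss = loss + torch.mean((out[..., :, k:] - out[..., :, :-k]) ** 2)
            loss = loss + torch.mean((out[..., k:, k:] - out[..., :-k, :-k]) ** 2)
        return loss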
Exemplo n.º 25
0
class App:
    def __init__(self, model, early_stopping=True):
        self.model = model
        if early_stopping:
            self.early_stopping = EarlyStopping(patience=100, verbose=True)

    def train(self, data, config, save_path='', mode=NODE_CLASSIFICATION):

        loss_fcn = torch.nn.CrossEntropyLoss()

        optimizer = torch.optim.Adam(self.model.parameters(),
                                     lr=config['lr'],
                                     weight_decay=config['weight_decay'])

        labels = data[LABELS]
        # initialize graph
        if mode == NODE_CLASSIFICATION:
            train_mask = data[TRAIN_MASK]
            val_mask = data[VAL_MASK]
            dur = []
            for epoch in range(config['n_epochs']):
                self.model.train()
                if epoch >= 3:
                    t0 = time.time()
                # forward
                logits = self.model(None)
                loss = loss_fcn(logits[train_mask], labels[train_mask])

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                if epoch >= 3:
                    dur.append(time.time() - t0)

                val_acc, val_loss = self.model.eval_node_classification(
                    labels, val_mask)
                print(
                    "Epoch {:05d} | Time(s) {:.4f} | Train loss {:.4f} | Val accuracy {:.4f} | "
                    "Val loss {:.4f}".format(epoch,
                                             np.mean(dur) if dur else 0,
                                             loss.item(), val_acc, val_loss))

                self.early_stopping(val_loss, self.model, save_path)

                if self.early_stopping.early_stop:
                    print("Early stopping")
                    break

        elif mode == GRAPH_CLASSIFICATION:
            self.accuracies = np.zeros(10)
            graphs = data[GRAPH]  # load all the graphs
            for k in range(10):  # 10-fold cross validation
                start = int(len(graphs) / 10) * k
                end = int(len(graphs) / 10) * (k + 1)

                # testing batch
                testing_graphs = graphs[start:end]
                self.testing_labels = labels[start:end]
                self.testing_batch = dgl.batch(testing_graphs)

                # training batch
                training_graphs = graphs[:start] + graphs[end:]
                training_labels = labels[list(range(0, start)) +
                                         list(range(end, len(graphs)))]
                training_samples = list(
                    map(list, zip(training_graphs, training_labels)))
                training_batches = DataLoader(training_samples,
                                              batch_size=config['batch_size'],
                                              shuffle=True,
                                              collate_fn=collate)

                dur = []
                for epoch in range(config['n_epochs']):
                    self.model.train()
                    if epoch >= 3:
                        t0 = time.time()
                    losses = []
                    training_accuracies = []
                    for bg, label in training_batches:
                        logits = self.model(bg)
                        loss = loss_fcn(logits, label)
                        losses.append(loss.item())
                        _, indices = torch.max(logits, dim=1)
                        correct = torch.sum(indices == label)
                        training_accuracies.append(correct.item() * 1.0 /
                                                   len(label))

                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()

                    if epoch >= 3:
                        dur.append(time.time() - t0)
                    val_acc, val_loss = self.model.eval_graph_classification(
                        self.testing_labels, self.testing_batch)
                    print(
                        "Epoch {:05d} | Time(s) {:.4f} | Train acc {:.4f} | Train loss {:.4f} "
                        "| Val accuracy {:.4f} | Val loss {:.4f}".format(
                            epoch,
                            np.mean(dur) if dur else 0,
                            np.mean(training_accuracies), np.mean(losses),
                            val_acc, val_loss))

                    is_better = self.early_stopping(val_loss, self.model,
                                                    save_path)
                    if is_better:
                        self.accuracies[k] = val_acc

                    if self.early_stopping.early_stop:
                        print("Early stopping")
                        break
                self.early_stopping.reset()
        else:
            raise RuntimeError

    def test(self, data, load_path='', mode=NODE_CLASSIFICATION):

        try:
            print('*** Load pre-trained model ***')
            self.model = load_checkpoint(self.model, load_path)
        except ValueError as e:
            print('Error while loading the model.', e)

        if mode == NODE_CLASSIFICATION:
            test_mask = data[TEST_MASK]
            labels = data[LABELS]
            acc, _ = self.model.eval_node_classification(labels, test_mask)
        else:
            acc = np.mean(self.accuracies)

        print("\nTest Accuracy {:.4f}".format(acc))

        return acc
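Every example on this page instantiates an `EarlyStopping` helper without showing its body, and the call sites differ slightly: some call it as `early_stopping(val_loss, model, save_path)`, some check `.early_stop` or `.counter`, some call `.reset()`, and others use a `.stop(loss)` method. The sketch below is only a minimal patience-based variant matching the `(val_loss, model, save_path)` convention used in the class above; the real helpers behind these examples may differ.

# A minimal patience-based early-stopping sketch matching the
# `early_stopping(val_loss, model, save_path)` convention used above.
import torch

class EarlyStopping:
    def __init__(self, patience=100, verbose=False, delta=0.0):
        self.patience = patience
        self.verbose = verbose
        self.delta = delta
        self.reset()

    def reset(self):
        self.best_loss = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, val_loss, model=None, save_path=''):
        improved = self.best_loss is None or val_loss < self.best_loss - self.delta
        if improved:
            self.best_loss = val_loss
            self.counter = 0
            if model is not None and save_path:
                torch.save(model.state_dict(), save_path)  # checkpoint the best model so far
            if self.verbose:
                print(f'Validation loss improved to {val_loss:.4f}')
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        return improved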
Exemplo n.º 26
0
    def train(self, data_train, data_valid, enable_es=1):

        with tf.Session(graph=self.graph) as session:
            tf.set_random_seed(1234)

            logger = Logger(session, self.summary_dir)
            # here you initialize the tensorflow saver that will be used in saving the checkpoints.
            # max_to_keep: defaults to keeping the 5 most recent checkpoints of your model
            saver = tf.train.Saver()
            early_stopping = EarlyStopping()

            if (self.restore == 1 and self.load(session, saver)):
                num_epochs_trained = self.model_graph.cur_epoch_tensor.eval(
                    session)
                print('EPOCHS trained: ', num_epochs_trained)
            else:
                print('Initializing Variables ...')
                tf.global_variables_initializer().run()

            if self.model_graph.cur_epoch_tensor.eval(session) == self.epochs:
                return

            for cur_epoch in range(
                    self.model_graph.cur_epoch_tensor.eval(session),
                    self.epochs + 1, 1):

                print('EPOCH: ', cur_epoch)
                self.current_epoch = cur_epoch
                # beta=utils.sigmoid(cur_epoch- 50)
                beta = 1.
                losses, recons, cond_prior, KL_w, y_prior, L2_loss = self.train_epoch(
                    session, logger, data_train, beta=beta)
                train_string = 'TRAIN | Loss: ' + str(losses) + \
                            ' | Recons: ' + str(recons) + \
                            ' | CP: ' + str(cond_prior) + \
                            ' | KL_w: ' + str(KL_w) + \
                            ' | KL_y: ' + str(y_prior) + \
                            ' | L2_loss: '+  str(L2_loss)
                # train_string = colored(train_string, 'red', attrs=['reverse', 'blink'])
                train_string = colored(train_string, 'red')
                if np.isnan(losses):
                    print(
                        'Encountered NaN, stopping training. Please check the learning_rate settings and the momentum.'
                    )
                    print('Recons: ', recons)
                    print('CP: ', cond_prior)
                    print('KL_w: ', KL_w)
                    print('KL_y: ', y_prior)
                    sys.exit()

                loss_val, recons, cond_prior, KL_w, y_prior, L2_loss = self.valid_epoch(
                    session, logger, data_valid, beta=beta)
                valid_string = 'VALID | Loss: ' + str(loss_val) + \
                            ' | Recons: ' + str(recons) + \
                            ' | CP: ' + str(cond_prior) + \
                            ' | KL_w: ' + str(KL_w) + \
                            ' | KL_y: ' + str(y_prior) + \
                            ' | L2_loss: '+  str(L2_loss)

                print(train_string)
                print(valid_string)

                if (cur_epoch > 0 and cur_epoch % 10 == 0):
                    self.save(
                        session, saver,
                        self.model_graph.global_step_tensor.eval(session))

                session.run(self.model_graph.increment_cur_epoch_tensor)

                #Early stopping
                if (enable_es == 1 and early_stopping.stop(loss_val)):
                    print('Early Stopping!')
                    break

            self.save(session, saver,
                      self.model_graph.global_step_tensor.eval(session))

        return
    def fit(self, X, y=None):
        print('\nProcessing data...')

        self.data_train = data_utils.process_data(X, y, test_size=0)
        if self.config.plot:
            self.data_plot = self.data_train

        self.config.num_batches = self.data_train.num_batches(
            self.config.batch_size)

        if not self.config.isBuilt:
            self.config.restore = True
            self.build_model(self.data_train.height, self.data_train.width,
                             self.data_train.num_channels)
        else:
            assert (self.config.height == self.data_train.height) and \
                   (self.config.width == self.data_train.width) and \
                   (self.config.num_channels == self.data_train.num_channels), \
                'Wrong dimension of data. Expected shape {}, and got {}'.format(
                    (self.config.height, self.config.width, self.config.num_channels),
                    (self.data_train.height, self.data_train.width, self.data_train.num_channels))
        ''' 
         -------------------------------------------------------------------------------
                                        TRAIN THE MODEL
        ------------------------------------------------------------------------------------- 
        '''
        print('\nTraining a model...')
        with tf.Session(graph=self.graph) as session:
            tf.set_random_seed(self.config.seeds)
            self.session = session
            logger = Logger(self.session, self.config.log_dir)
            saver = tf.train.Saver()

            early_stopper = EarlyStopping(name='total loss',
                                          decay_fn=self.decay_fn)

            if (self.config.restore and self.load(self.session, saver)):
                load_config = file_utils.load_args(self.config.model_name,
                                                   self.config.config_dir)
                self.config.update(load_config)

                num_epochs_trained = self.model_graph.cur_epoch_tensor.eval(
                    self.session)
                print('EPOCHS trained: ', num_epochs_trained)
            else:
                print('Initializing Variables ...')
                tf.global_variables_initializer().run()

            for cur_epoch in range(
                    self.model_graph.cur_epoch_tensor.eval(self.session),
                    self.config.epochs + 1, 1):
                print('EPOCH: ', cur_epoch)
                self.current_epoch = cur_epoch

                losses_tr = self._train(self.data_train, self.session, logger)

                if np.isnan(losses_tr[0]):
                    print(
                        'Encountered NaN, stopping training. Please check the learning_rate settings and the momentum.'
                    )
                    for lname, lval in zip(self.model_graph.losses, losses_tr):
                        print(lname, lval)
                    sys.exit()

                train_msg = 'TRAIN: \n'
                for lname, lval in zip(self.model_graph.losses, losses_tr):
                    train_msg += str(lname) + ': ' + str(lval) + ' | '

                print(train_msg)
                print()

                if cur_epoch == 1 or (cur_epoch % self.config.save_epoch == 0 and cur_epoch != 0):
                    gc.collect()
                    self.save(
                        self.session, saver,
                        self.model_graph.global_step_tensor.eval(self.session))
                    if self.config.plot:
                        self.plot_latent(cur_epoch)

                self.session.run(self.model_graph.increment_cur_epoch_tensor)

                # Early stopping
                if (self.config.early_stopping
                        and early_stopper.stop(losses_tr[0])):
                    print('Early Stopping!')
                    break

                if cur_epoch % self.config.colab_save == 0:
                    if self.config.colab:
                        self.push_colab()

            self.save(self.session, saver,
                      self.model_graph.global_step_tensor.eval(self.session))
            if self.config.plot:
                self.plot_latent(cur_epoch)

            if self.config.colab:
                self.push_colab()

        return
Exemplo n.º 28
0
    def __init__(self, model, early_stopping=True):
        self.model = model
        if early_stopping:
            self.early_stopping = EarlyStopping(patience=100, verbose=True)
Exemplo n.º 29
0
def main():
    args = parse_args()
    config_path = args.config_file_path

    config = get_config(config_path, new_keys_allowed=True)

    config.defrost()
    config.experiment_dir = os.path.join(config.log_dir, config.experiment_name)
    config.tb_dir = os.path.join(config.experiment_dir, 'tb')
    config.model.best_checkpoint_path = os.path.join(config.experiment_dir, 'best_checkpoint.pt')
    config.model.last_checkpoint_path = os.path.join(config.experiment_dir, 'last_checkpoint.pt')
    config.config_save_path = os.path.join(config.experiment_dir, 'segmentation_config.yaml')
    config.freeze()

    init_experiment(config)
    set_random_seed(config.seed)

    train_dataset = make_dataset(config.train.dataset)
    train_loader = make_data_loader(config.train.loader, train_dataset)

    val_dataset = make_dataset(config.val.dataset)
    val_loader = make_data_loader(config.val.loader, val_dataset)

    device = torch.device(config.device)
    model = make_model(config.model).to(device)

    optimizer = make_optimizer(config.optim, model.parameters())
    scheduler = None

    loss_f = make_loss(config.loss)

    early_stopping = EarlyStopping(
        **config.stopper.params
    )

    train_writer = SummaryWriter(log_dir=os.path.join(config.tb_dir, 'train'))
    val_writer = SummaryWriter(log_dir=os.path.join(config.tb_dir, 'val'))

    for epoch in range(1, config.epochs + 1):
        print(f'Epoch {epoch}')
        train_metrics = train(model, optimizer, train_loader, loss_f, device)
        write_metrics(epoch, train_metrics, train_writer)
        print_metrics('Train', train_metrics)

        val_metrics = val(model, val_loader, loss_f, device)
        write_metrics(epoch, val_metrics, val_writer)
        print_metrics('Val', val_metrics)

        early_stopping(val_metrics['loss'])
        if config.model.save and early_stopping.counter == 0:
            torch.save(model.state_dict(), config.model.best_checkpoint_path)
            print('Saved best model checkpoint to disk.')
        if early_stopping.early_stop:
            print(f'Early stopping after {epoch} epochs.')
            break

        if scheduler:
            scheduler.step()

    train_writer.close()
    val_writer.close()

    if config.model.save:
        torch.save(model.state_dict(), config.model.last_checkpoint_path)
        print('Saved last model checkpoint to disk.')
Exemplo n.º 30
0
def train(pairs_batch_train, pairs_batch_dev, encoder, decoder,
          encoder_optimizer, decoder_optimizer, criterion, ctc_loss,
          batch_size, num_epochs, device, train_data_len, dev_data_len):
    clip = 1.0
    tf_rate = 1
    lambda_factor = 0.8
    early_stopping = EarlyStopping(patience=10, verbose=False, delta=0)

    for epoch in range(num_epochs):  # use the num_epochs argument instead of a hard-coded range
        encoder.train()
        decoder.train()

        batch_loss_train = 0
        batch_loss_dev = 0

        for iteration, batch in enumerate(pairs_batch_train):

            pad_input_seqs, input_seq_lengths, pad_target_seqs, target_seq_lengths = batch
            pad_input_seqs, pad_target_seqs = pad_input_seqs.to(
                device), pad_target_seqs.to(device)

            train_loss = 0

            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()

            #print(torch.isnan(pad_input_seqs).any())
            encoder_output, encoder_hidden, encoder_output_prob = encoder(
                pad_input_seqs, input_seq_lengths)

            decoder_input = torch.ones(batch_size, 1).long().to(device)
            decoder_hidden = (encoder_hidden[0].sum(0, keepdim=True),
                              encoder_hidden[1].sum(0, keepdim=True))

            teacher_forcing = random.random() <= tf_rate

            attn_weights = F.softmax(torch.ones(encoder_output.size(1), 1,
                                                encoder_output.size(0)),
                                     dim=-1).to(device)

            if teacher_forcing:
                for i in range(0, pad_target_seqs.size(0)):
                    decoder_output, decoder_hidden, attn_weights = decoder(
                        decoder_input, decoder_hidden, encoder_output,
                        attn_weights)
                    target = pad_target_seqs.squeeze()
                    train_loss += criterion(decoder_output, target[i])
                    decoder_input = pad_target_seqs[i].detach()
            else:
                for i in range(0, pad_target_seqs.size(0)):
                    decoder_output, decoder_hidden = decoder(
                        decoder_input, decoder_hidden, encoder_output)
                    _, topi = decoder_output.topk(1)
                    target = pad_target_seqs.squeeze()
                    train_loss += criterion(decoder_output, target[i])
                    decoder_input = topi.detach()

            # CTC LOSS
            targets = pad_target_seqs.squeeze().permute(1, 0)
            input_lengths = torch.ones(
                encoder_output_prob.size(1)) * encoder_output_prob.size(0)
            input_lengths = input_lengths.type(torch.LongTensor)
            target_seq_lengths = np.array(target_seq_lengths)
            target_lengths = torch.from_numpy(target_seq_lengths)

            train_loss_ctc = ctc_loss(encoder_output_prob, targets,
                                      input_lengths, target_lengths)

            loss = (0.8 * train_loss) + (0.2 * train_loss_ctc)
            #loss = train_loss
            batch_loss_train += loss.data

            ## backward step
            loss.backward()

            torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)
            torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)

            encoder_optimizer.step()
            decoder_optimizer.step()

        # CALCULATE EVALUATION
        with torch.no_grad():
            encoder.eval()
            decoder.eval()

            for _, batch in enumerate(pairs_batch_dev):
                pad_input_seqs, input_seq_lengths, pad_target_seqs, target_seq_lengths = batch
                pad_input_seqs, pad_target_seqs = pad_input_seqs.to(
                    device), pad_target_seqs.to(device)

                dev_loss = 0

                encoder_output, encoder_hidden, encoder_output_prob = encoder(
                    pad_input_seqs, input_seq_lengths)
                decoder_input = torch.ones(batch_size, 1).long().to(device)
                decoder_hidden = (encoder_hidden[0].sum(0, keepdim=True),
                                  encoder_hidden[1].sum(0, keepdim=True))

                teacher_forcing = random.random() <= tf_rate

                attn_weights = F.softmax(torch.ones(encoder_output.size(1), 1,
                                                    encoder_output.size(0)),
                                         dim=-1).to(device)

                if teacher_forcing:
                    for i in range(0, pad_target_seqs.size(0)):
                        decoder_output, decoder_hidden, attn_weights = decoder(
                            decoder_input, decoder_hidden, encoder_output,
                            attn_weights)
                        target = pad_target_seqs.squeeze()
                        dev_loss += criterion(decoder_output, target[i])
                        decoder_input = pad_target_seqs[i].detach()
                else:
                    for i in range(0, pad_target_seqs.size(0)):
                        decoder_output, decoder_hidden = decoder(
                            decoder_input, decoder_hidden, encoder_output)
                        _, topi = decoder_output.topk(1)
                        target = pad_target_seqs.squeeze()
                        dev_loss += criterion(decoder_output, target[i])
                        decoder_input = topi.detach()

                # CTC LOSS
                targets = pad_target_seqs.squeeze().permute(1, 0)
                input_lengths = torch.ones(
                    encoder_output_prob.size(1)) * encoder_output_prob.size(0)
                input_lengths = input_lengths.type(torch.LongTensor)
                target_seq_lengths = np.array(target_seq_lengths)
                target_lengths = torch.from_numpy(target_seq_lengths)
                dev_loss_ctc = ctc_loss(encoder_output_prob, targets,
                                        input_lengths, target_lengths)

                loss_dev = (0.8 * dev_loss) + (0.2 * dev_loss_ctc)
                #loss_dev = dev_loss
                batch_loss_dev += loss_dev.data

        print('[Epoch: %d] train_loss: %.4f    val_loss: %.4f' %
              (epoch + 1,
               (batch_loss_train.item() / (train_data_len / batch_size)),
               (batch_loss_dev.item() / (dev_data_len / batch_size))))

        with open('loss/english_asr_finetuned.txt', 'a') as f:
            f.write(
                str(epoch + 1) + '\t' + str(batch_loss_train.item() /
                                            (train_data_len / batch_size)) +
                '  ' + str(batch_loss_dev.item() /
                           (dev_data_len / batch_size)) + '\n')

        print('saving the models...')
        torch.save(
            {
                'encoder': encoder.state_dict(),
                'decoder': decoder.state_dict(),
                'encoder_optimizer': encoder_optimizer.state_dict(),
                'decoder_optimizer': decoder_optimizer.state_dict(),
            }, 'weights/english_asr_finetuned/state_dict_' + str(epoch + 1) +
            '.pt')