Example #1
def StartTraining():

    training_Canvas = tk.Label(window)
    trainingAndVal_Canvas = tk.Label(window)

    resultTextbox.insert("end", "Training Start\n")

    window.update_idletasks()

    ##### Init Args

    args.dataset = datasetSelected.get()

    args.vocab_size = int(vocab_size.get())
    args.validation_portion = float(validation_portion.get())
    args.test_portion = float(test_portion.get())
    args.batch_size = int(batch_size.get())
    args.L2 = float(L2.get())
    args.lr = float(lr.get())
    args.n_epoch = int(n_epoch.get())
    args.earlyStopStep = int(earlyStopStep.get())
    args.earlyStopEpoch = int(earlyStopEpoch.get())
    args.val_freq = int(val_freq.get())
    args.val_steps = int(val_steps.get())
    args.log_freq = int(log_freq.get())
    args.GatedCNN_embedingDim = int(GatedCNN_embedingDim.get())
    args.GatedCNN_convDim = int(GatedCNN_convDim.get())
    args.GatedCNN_kernel = int(GatedCNN_kernel.get())
    args.GatedCNN_stride = int(GatedCNN_stride.get())
    args.GatedCNN_pad = int(GatedCNN_pad.get())
    args.GatedCNN_layers = int(GatedCNN_layers.get())
    args.GatedCNN_dropout = float(GatedCNN_dropout.get())
    args.SSCL_embedingDim = int(SSCL_embedingDim.get())
    args.SSCL_RNNHidden = int(SSCL_RNNHidden.get())
    args.SSCL_CNNDim = int(SSCL_CNNDim.get())
    args.SSCL_CNNKernel = int(SSCL_CNNKernel.get())
    args.SSCL_CNNDropout = float(SSCL_CNNDropout.get())
    args.SSCL_LSTMDropout = float(SSCL_LSTMDropout.get())
    args.SSCL_LSTMLayers = int(SSCL_LSTMLayers.get())
    args.SelfAttn_LenMaxSeq = int(SelfAttn_LenMaxSeq.get())
    args.SelfAttn_ModelDim = int(SelfAttn_ModelDim.get())
    args.SelfAttn_FFInnerDim = int(SelfAttn_FFInnerDim.get())
    args.SelfAttn_NumLayers = int(SelfAttn_NumLayers.get())
    args.SelfAttn_NumHead = int(SelfAttn_NumHead.get())
    args.SelfAttn_KDim = int(SelfAttn_KDim.get())
    args.SelfAttn_VDim = int(SelfAttn_VDim.get())
    args.SelfAttn_Dropout = float(SelfAttn_Dropout.get())

    args.model_name = model_name.get()

    args.model_path = './' + args.dataset + '_Log/' + args.model_name + '/Model/'
    args.log_path = './' + args.dataset + '_Log/' + args.model_name + '/Log/'

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    args.device = device

    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    if not os.path.exists(args.log_path):
        os.makedirs(args.log_path)

    training_dataset, validation_dataset, test_dataset, text = TkloadingData(
        args, resultTextbox, window)
    resultTextbox.see("end")

    args.numberOfSpammer = sum([t[-1] for t in training_dataset])
    args.numberOfNonSpammer = len(training_dataset) - args.numberOfSpammer
    args.len_max_seq = training_dataset[0][2]

    resultTextbox.insert(
        "end",
        ("Number of Spammer: " + str(args.numberOfSpammer.item()) + "\n"))
    resultTextbox.insert("end", ("Number of NonSpammer: " +
                                 str(args.numberOfNonSpammer.item()) + "\n"))

    window.update_idletasks()

    if args.usingWeightRandomSampling:
        sampler = getSampler(training_dataset)
    else:
        sampler = None

    train_loader = DataLoader(training_dataset,
                              batch_size=args.batch_size,
                              shuffle=False,
                              drop_last=False,
                              sampler=sampler)
    valid_loader = DataLoader(validation_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              drop_last=False)

    if modelSelected.get() == 'SSCL':
        resultTextbox.insert("end", "Using SSCL\n")
        trainModel = SSCL
    elif modelSelected.get() == 'GatedCNN':
        resultTextbox.insert("end", "Using GatedCNN\n")
        trainModel = GatedCNN
    elif modelSelected.get() == 'SelfAttn':
        resultTextbox.insert("end", "Using SelfAttn\n")
        trainModel = SelfAttnModel
    else:
        resultTextbox.insert("end", 'No Support For this Model')
        raise ValueError

    window.update_idletasks()

    trainer = TkTrainer(trainModel, args, resultTextbox, window).to(device)

    resultTextbox.insert("end", ("Number of Parameters in this Model: " +
                                 str(trainer.num_all_params()) + "\n"))
    resultTextbox.insert("end", ("Using device: " + str(device) + "\n"))
    window.update_idletasks()

    scheduler = optim.lr_scheduler.StepLR(trainer.optim, 2000, gamma=0.85)
    # trainer.optim.param_groups[0]['lr']=
    allStep = 0
    epoch = 0

    resultTextbox.insert("end",
                         ("Model Structure: \n" + str(trainer.model) + "\n"))

    resultTextbox.see("end")

    window.update_idletasks()

    while epoch < args.n_epoch:
        for i, (texts, X, X_len, y) in enumerate(train_loader):

            trainer.train()
            X, X_len, y = X.to(device), X_len.to(device), y.to(device)

            if trainer.optim.param_groups[0]['lr'] >= 0.00001:
                scheduler.step()
            start_t = time.time()
            #         trainer.train_step((X, X_len), y)
            trainer.train_step(X, y)

            end_t = time.time()
            allStep += 1
            resultTextbox.insert("end", (
                '| Epoch [%d] | Step [%d] | lr [%.6f] | Loss: [%.4f] | Acc: [%.4f] | Time: %.1fs \n'
                % (epoch, allStep, trainer.optim.param_groups[0]['lr'],
                   trainer.loss.item(), trainer.accuracy.item(),
                   end_t - start_t)))
            window.update_idletasks()
            resultTextbox.see("end")

            #         if trainer.accuracy.item() > 0.95: # Stop early
            #             resultTextbox.insert("end", "Train Accuracy Reach the Stop Accuracy ")
            #             raise StopIteration
            if allStep % args.log_freq == 0:

                #################################################

                trainer.plot_train_hist(args.model_name)

                TrainImg = ImageTk.PhotoImage(
                    Image.open(args.log_path + "Train_Loss&Acc_Hist_" +
                               str(args.model_name) + ".png").resize(
                                   (500, 600), Image.ANTIALIAS))

                training_Canvas.config(image=TrainImg)

                training_Canvas.image = TrainImg

                training_Canvas.place(x=0, y=450, anchor="nw")

                window.update_idletasks()

            if args.earlyStopStep:
                if allStep >= args.earlyStopStep:
                    resultTextbox.insert("end", "EarlyStopStep Reach")
                    break

            if allStep % args.val_freq == 0:

                for _ in range(args.val_steps):
                    trainer.eval()
                    stIdx = np.random.randint(
                        0,
                        len(validation_dataset) - args.batch_size)
                    v_text, v_X, v_X_len, v_y = validation_dataset[
                        stIdx:stIdx + args.batch_size]
                    v_X, v_X_len, v_y = v_X.to(device), v_X_len.to(
                        device), v_y.to(device)
                    start_t = time.time()
                    #                 trainer.test_step((v_X, v_X_len), v_y)
                    trainer.test_step(v_X, v_y)
                    end_t = time.time()
                    resultTextbox.insert("end", (
                        '| Epoch [%d] | Validation | Step [%d] |  Loss: [%.4f] | Acc: [%.4f] | Time: %.1fs \n'
                        % (epoch, allStep, trainer.loss.item(),
                           trainer.accuracy.item(), end_t - start_t)))
                    window.update_idletasks()
                    resultTextbox.see("end")

                trainer.calculateAverage()
                clear_output()
                resultTextbox.insert("end", ("TrainConfusion Matrix: \n"))
                resultTextbox.insert("end",
                                     pd.DataFrame(trainer.cms['Train'][-1]))
                resultTextbox.insert("end", "\n\n")
                resultTextbox.insert("end", ("ValidationConfusion Matrix: \n"))
                resultTextbox.insert("end",
                                     pd.DataFrame(trainer.cms['Val'][-1]))
                resultTextbox.insert("end", "\n\n")
                window.update_idletasks()
                resultTextbox.see("end")

                #################################################

                trainer.plot_all(args.model_name)

                TrainAndValImg = ImageTk.PhotoImage(
                    Image.open(args.log_path + "All_Hist_" +
                               str(args.model_name) + ".png").resize(
                                   (500, 600), Image.ANTIALIAS))

                trainingAndVal_Canvas.config(image=TrainAndValImg)

                trainingAndVal_Canvas.image = TrainAndValImg

                trainingAndVal_Canvas.place(x=500, y=450, anchor="nw")

                window.update_idletasks()

        epoch += 1
        trainer.model_save(epoch)

        if args.earlyStopEpoch:
            if epoch >= args.earlyStopEpoch:
                resultTextbox.insert("end", "EarlyStopEpoch Reach")
                break

    test_text, test_X, test_X_len, test_y = test_dataset[0:]
    test_X, test_X_len, test_y = test_X.to(device), test_X_len.to(device), test_y.to(device)
    test_loss, test_accuracy, test_cm = trainer.test_step(test_X, test_y)

    resultTextbox.insert("end", (
        "\n\n========================================================================================="
    ))
    resultTextbox.insert("end",
                         ("\nThe Test Loss: " + str(test_loss.item()) + "\n"))
    resultTextbox.insert(
        "end", ("The Test Accuracy: " + str(test_accuracy.item()) + "\n"))
    resultTextbox.insert("end", ("Test Confusion Matrix: \n"))
    resultTextbox.insert("end", (pd.DataFrame(test_cm)))
    resultTextbox.insert("end", "\n\n")
    resultTextbox.see("end")
Example #2
def StartTraining():

    training_Canvas = tk.Label(window)
    trainingAndVal_Canvas = tk.Label(window)

    resultTextbox.insert("end", "Training Start\n")
    window.update_idletasks()
    resultTextbox.see("end")

    # Init Args

    args.dataset = datasetSelected.get()

    args.vocab_size = int(vocab_size.get())
    args.validation_portion = float(validation_portion.get())
    args.test_portion = float(test_portion.get())
    args.batch_size = int(batch_size.get())
    args.L2 = float(L2.get())
    args.lr = float(lr.get())
    args.n_epoch = int(n_epoch.get())
    args.earlyStopStep = int(earlyStopStep.get())
    args.earlyStopEpoch = int(earlyStopEpoch.get())
    args.val_freq = int(val_freq.get())
    args.val_steps = int(val_steps.get())
    args.log_freq = int(log_freq.get())
    args.model_name = model_name.get()
    args.scheduler_step = int(scheduler_step.get())
    args.scheduler_gamma = float(scheduler_gamma.get())
    args.scheduler_minLr = float(scheduler_minLr.get())

    args.MultiTask_FCHidden = int(MultiTask_FCHidden.get())
    args.textModel_outDim = int(textModel_outDim.get())
    args.infoModel_outDim = int(infoModel_outDim.get())
    args.combine_dim = int(combine_dim.get())

    args.usingWeightRandomSampling = bool(usingWeightRandomSampling.get())
    args.runningOnSmallDataset = bool(runningOnSmallDataset.get())

    args.GatedCNN_embedingDim = int(GatedCNN_embedingDim.get())
    args.GatedCNN_convDim = int(GatedCNN_convDim.get())
    args.GatedCNN_kernel = int(GatedCNN_kernel.get())
    args.GatedCNN_stride = int(GatedCNN_stride.get())
    args.GatedCNN_pad = int(GatedCNN_pad.get())
    args.GatedCNN_layers = int(GatedCNN_layers.get())
    args.GatedCNN_dropout = float(GatedCNN_dropout.get())
    args.SSCL_embedingDim = int(SSCL_embedingDim.get())
    args.SSCL_RNNHidden = int(SSCL_RNNHidden.get())
    args.SSCL_CNNDim = int(SSCL_CNNDim.get())
    args.SSCL_CNNKernel = int(SSCL_CNNKernel.get())
    args.SSCL_CNNDropout = float(SSCL_CNNDropout.get())
    args.SSCL_LSTMDropout = float(SSCL_LSTMDropout.get())
    args.SSCL_LSTMLayers = int(SSCL_LSTMLayers.get())
    args.SelfAttn_LenMaxSeq = int(SelfAttn_LenMaxSeq.get())
    args.SelfAttn_ModelDim = int(SelfAttn_ModelDim.get())
    args.SelfAttn_WordVecDim = args.SelfAttn_ModelDim
    args.SelfAttn_FFInnerDim = int(SelfAttn_FFInnerDim.get())
    args.SelfAttn_NumLayers = int(SelfAttn_NumLayers.get())
    args.SelfAttn_NumHead = int(SelfAttn_NumHead.get())
    args.SelfAttn_KDim = int(SelfAttn_KDim.get())
    args.SelfAttn_VDim = int(SelfAttn_VDim.get())
    args.SelfAttn_Dropout = float(SelfAttn_Dropout.get())

    args.model_path = './' + args.dataset + '_Log/' + args.model_name + '/Model/'
    args.log_path = './' + args.dataset + '_Log/' + args.model_name + '/Log/'
    
    # When running on the small dataset, switch to the 'Small'-prefixed pickle name
    
    if args.runningOnSmallDataset:
        args.pickle_name = 'Small' + args.original_name
    else:
        args.pickle_name = args.original_name
    

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    args.device = device

    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    if not os.path.exists(args.log_path):
        os.makedirs(args.log_path)

    training_dataset, validation_dataset, test_dataset, text = TkloadingTweetsAndUserInfoData(
        args, resultTextbox, window)
    resultTextbox.see("end")

    args.numberOfSpammer = sum([t[-1] for t in training_dataset])
    args.numberOfNonSpammer = len(training_dataset)-args.numberOfSpammer
    args.len_max_seq = training_dataset[0][2]

    resultTextbox.insert("end", ("Number of Spammer: " +
                                 str(args.numberOfSpammer.item()) + "\n"))
    resultTextbox.insert("end", ("Number of NonSpammer: " +
                                 str(args.numberOfNonSpammer.item()) + "\n"))
    window.update_idletasks()

    if args.usingWeightRandomSampling:
        resultTextbox.insert("end", "Using WeightRandomSampling... \n")
        sampler = getSampler(training_dataset)
    else:
        resultTextbox.insert("end", "Not Using WeightRandomSampling... \n")
        sampler = None

    window.update_idletasks()

    train_loader = DataLoader(
        training_dataset, batch_size=args.batch_size, shuffle=False, drop_last=False, sampler=sampler)
    valid_loader = DataLoader(
        validation_dataset, batch_size=args.batch_size, shuffle=True, drop_last=False)

    if modelSelected.get() == 'SSCL':
        resultTextbox.insert("end", "Using SSCL\n")
        trainModel = SSCL
    elif modelSelected.get() == 'GatedCNN':
        resultTextbox.insert("end", "Using GatedCNN\n")
        trainModel = GatedCNN
    elif modelSelected.get() == 'SelfAttn':
        resultTextbox.insert("end", "Using SelfAttn\n")
        trainModel = SelfAttnModel
    elif (args.using_infoModel and (not args.using_textModel)):
        resultTextbox.insert("end", "Using infoModel Only\n")
        trainModel = None
    else:
        resultTextbox.insert("end", 'No Support For this Model')
        raise ValueError

    window.update_idletasks()

    trainer = TkMultiTaskTrainer(
        trainModel, args, resultTextbox, window).to(device)

    resultTextbox.insert(
        "end", ("Number of Parameters in this Model: " + str(trainer.num_all_params()) + "\n"))
    resultTextbox.insert("end", ("Using device: " + str(device) + "\n"))
    window.update_idletasks()

    args.using_scheduler = bool(args.scheduler_step) and 0 < args.scheduler_gamma < 1

    if args.using_scheduler:
        scheduler = optim.lr_scheduler.StepLR(
            trainer.optim, args.scheduler_step, gamma=args.scheduler_gamma)

    # trainer.optim.param_groups[0]['lr']=
    allStep = 0
    epoch = 0

    resultTextbox.insert(
        "end", ("Model Structure: \n" + str(trainer.model) + "\n"))

    resultTextbox.see("end")

    window.update_idletasks()

    while epoch < args.n_epoch:
        for i, (text, extra_info, length, label) in enumerate(train_loader):

            trainer.train()
            text, extra_info, length, label = text.to(device), extra_info.to(
                device), length.to(device), label.to(device)

            if args.using_scheduler:
                if trainer.optim.param_groups[0]['lr'] >= args.scheduler_minLr:
                    scheduler.step()

            start_t = time.time()
    #         trainer.train_step((X, X_len), y)
            trainer.train_step(((text, extra_info), None), label)

            end_t = time.time()
            allStep += 1
            resultTextbox.insert("end", ('| Epoch [%d] | Step [%d] | lr [%.6f] | Loss: [%.4f] | Acc: [%.4f] | Time: %.1fs \n' %
                                         (epoch, allStep, trainer.optim.param_groups[0]['lr'], trainer.loss.item(), trainer.accuracy.item(),
                                          end_t - start_t)))
            window.update_idletasks()
            resultTextbox.see("end")

    #         if trainer.accuracy.item() > 0.95: # Stop early
    #             resultTextbox.insert("end", "Train Accuracy Reach the Stop Accuracy ")
    #             raise StopIteration
            if allStep % args.log_freq == 0:

                #################################################

                trainer.plot_train_hist(args.model_name)

                TrainImg = ImageTk.PhotoImage(Image.open(args.log_path+"Train_Loss&Acc_Hist_" + str(
                    args.model_name) + ".png").resize((960, 470), Image.ANTIALIAS))

                training_Canvas.config(image=TrainImg)

                training_Canvas.image = TrainImg

                training_Canvas.grid(column=5, row=0, rowspan=19)

                window.update_idletasks()

            if args.earlyStopStep:
                if allStep >= args.earlyStopStep:
                    resultTextbox.insert("end", "EarlyStopStep Reach")
                    break

            if allStep % args.val_freq == 0:

                for _ in range(args.val_steps):
                    trainer.eval()
                    stIdx = np.random.randint(
                        0, len(validation_dataset) - args.batch_size)
                    v_text, v_extra_info, v_len, v_label = validation_dataset[stIdx: stIdx +
                                                                              args.batch_size]
                    v_text, v_extra_info, v_len, v_label = v_text.to(
                        device), v_extra_info.to(device), v_len.to(device), v_label.to(device)
                    start_t = time.time()
                    trainer.test_step(((v_text, v_extra_info), None), v_label)
                    end_t = time.time()
                    resultTextbox.insert("end", ('| Epoch [%d] | Validation | Step [%d] |  Loss: [%.4f] | Acc: [%.4f] | Time: %.1fs \n' %
                                                 (epoch, allStep, trainer.loss.item(), trainer.accuracy.item(), end_t - start_t)))
                    window.update_idletasks()
                    resultTextbox.see("end")

                trainer.calculateAverage()
                clear_output()
                resultTextbox.insert("end", ("TrainConfusion Matrix: \n"))
                resultTextbox.insert(
                    "end", pd.DataFrame(trainer.cms['Train'][-1]))
                resultTextbox.insert("end", "\n\n")
                resultTextbox.insert("end", ("ValidationConfusion Matrix: \n"))
                resultTextbox.insert(
                    "end", pd.DataFrame(trainer.cms['Val'][-1]))
                resultTextbox.insert("end", "\n\n")
                window.update_idletasks()
                resultTextbox.see("end")

                #################################################

                trainer.plot_all(args.model_name)

                TrainAndValImg = ImageTk.PhotoImage(Image.open(
                    args.log_path + "All_Hist_" + str(args.model_name) + ".png").resize((960, 470), Image.ANTIALIAS))

                trainingAndVal_Canvas.config(image=TrainAndValImg)

                trainingAndVal_Canvas.image = TrainAndValImg

                trainingAndVal_Canvas.grid(column=5, row=20)

                window.update_idletasks()

        epoch += 1
        trainer.model_save(epoch)

        if args.earlyStopEpoch:
            if epoch >= args.earlyStopEpoch:
                resultTextbox.insert("end", "EarlyStopEpoch Reach")
                break

    # Evaluate on the held-out test set

    test_loader = DataLoader(
        test_dataset, batch_size=args.batch_size, shuffle=False, drop_last=False)
    
    print('Test_dataset size:', len(test_dataset))
    test_accs = []
    test_cms = []
    trainer.eval()
    for i, (test_text, test_extra_info, test_length, test_label) in enumerate(test_loader):
        test_text, test_extra_info, test_length, test_label = test_text.to(
            device), test_extra_info.to(device), test_length.to(device), test_label.to(device)
        test_loss, test_accuracy, test_cm = trainer.test_step(((test_text, test_extra_info), None), test_label)
#         trainer.test_step(((test_text, test_extra_info), test_length), test_label)
        test_accs.append(test_accuracy)
        test_cms.append(test_cm)

    resultTextbox.insert(
        "end", ("\n\n=========================================================================================\n"))
    resultTextbox.insert("end", ("The Test Accuracy: " +
                                 str(torch.mean(torch.tensor(test_accs))) + "\n"))
    resultTextbox.insert("end", ("Test Confusion Matrix: \n"))
    resultTextbox.insert("end", (pd.DataFrame(sum(test_cms))))
    resultTextbox.insert("end", "\n\n")
    resultTextbox.see("end")
    

training_dataset, validation_dataset, test_dataset, tweets_text = loadingTweetsAndUserInfoData(args)


args.numberOfSpammer = sum([t[-1] for t in training_dataset])
args.numberOfNonSpammer = len(training_dataset)-args.numberOfSpammer
args.len_max_seq = training_dataset[0][2]

print("Number of Spammer: ", args.numberOfSpammer)
print("Number of NonSpammer: ", args.numberOfNonSpammer)



if args.usingWeightRandomSampling:
    sampler = getSampler(training_dataset)
else:
    sampler = None

train_loader = DataLoader(
    training_dataset, batch_size=args.batch_size, shuffle=False, drop_last=False, sampler=sampler)
valid_loader = DataLoader(
    validation_dataset, batch_size=args.batch_size, shuffle=True, drop_last=False)

trainer = MultiTaskTrainer(SelfAttnModel, args).to(device)

print("Number of Parameters in this Model: ",trainer.num_all_params())
print("Using device: ", device)

scheduler = optim.lr_scheduler.StepLR(trainer.optim, 2000, gamma=0.85)
# trainer.optim.param_groups[0]['lr']=
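
StartTraining reads every hyperparameter through a .get() call, so the surrounding script is expected to build the window, the resultTextbox output widget, and one Tk variable per field before the function is attached to a button. A minimal wiring sketch, assuming tk.StringVar fields and a plain grid layout; only a few representative fields are shown, and the widget placement is a placeholder rather than the original GUI.

import tkinter as tk

window = tk.Tk()
window.title("Spammer Detection Trainer")

# One StringVar per hyperparameter that StartTraining() reads with .get().
datasetSelected = tk.StringVar(value="Twitter")
modelSelected = tk.StringVar(value="SelfAttn")
vocab_size = tk.StringVar(value="20000")
lr = tk.StringVar(value="0.001")
batch_size = tk.StringVar(value="64")
model_name = tk.StringVar(value="run1")
# ... the remaining fields (n_epoch, val_freq, SelfAttn_*, ...) follow the same pattern.

tk.Entry(window, textvariable=lr, width=10).grid(column=1, row=0)

# Text widget that StartTraining() appends progress lines to.
resultTextbox = tk.Text(window, width=120, height=20)
resultTextbox.grid(column=0, row=1, columnspan=4)

# The training loop keeps the UI responsive through window.update_idletasks().
tk.Button(window, text="Start Training", command=StartTraining).grid(column=0, row=0)

window.mainloop()
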