def main():
    """ PyTorch AlexNet implementation. """
    global args, best_prec1
    args = parser.parse_args()

    # create model
    if args.arch == 'alexnet':
        model = alexnet(pretrained=args.pretrained)
    else:
        raise NotImplementedError

    # use CUDA
    model.cuda()

    # define loss and optimizer
    loss = nn.CrossEntropyLoss().cuda()
    optimizer = optim.SGD(model.parameters(), lr=args.lr,
                          momentum=args.momentum, weight_decay=args.weightdecay)

    train_dl, val_dl = data_loader(args.data, args.batch_size, args.cuda_workers)

    if args.evaluate:
        validate(val_dl, model, loss)
        return

    for epoch in range(args.start_epoch, args.epochs):
        custom_weight_decay(optimizer, epoch, args.lr)

        # train for one epoch
        train(train_dl, model, loss, optimizer, epoch)

        # evaluate on validation set
        prec1, prec5 = validate(val_dl, model, loss)

        # remember the best prec@1 and save a checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        create_checkpoint({
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
            'optimizer': optimizer.state_dict()
        }, is_best, args.arch + '.pth')
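# Note: `create_checkpoint(state, is_best, filename)` is called above but not
# defined in this snippet. A minimal sketch of what such a helper usually looks
# like in PyTorch training scripts (the 'model_best_' copy name is an assumption):
import shutil

import torch


def create_checkpoint(state, is_best, filename='checkpoint.pth'):
    """Save the training state; keep a separate copy of the best model so far."""
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best_' + filename)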
def train(config):
    # determine the filename (to be used for saving results, checkpoints, models, etc.)
    filename = Path(config.txt_file).stem

    # Initialize the device on which to run the model
    if config.device == 'cuda':
        if torch.cuda.is_available():
            device = torch.device(config.device)
        else:
            device = torch.device('cpu')
    else:
        device = torch.device(config.device)

    # Initialize the dataset and data loader (note the +1)
    dataset = TextDataset(
        filename=config.txt_file,
        seq_length=config.seq_length
    )
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # get the vocabulary size and int2char and char2int dictionaries for use later
    VOCAB_SIZE = dataset.vocab_size

    # Initialize the model that we are going to use
    model = TextGenerationModel(
        batch_size=config.batch_size,
        seq_length=config.seq_length,
        vocabulary_size=VOCAB_SIZE,
        lstm_num_hidden=config.lstm_num_hidden,
        lstm_num_layers=config.lstm_num_layers,
        device=device,
        batch_first=config.batch_first,
        dropout=1.0 - config.dropout_keep_prob
    )

    # Set up the loss, optimizer and learning rate scheduler
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), config.learning_rate)

    # Load the latest checkpoint, if any exists
    checkpoints = list(CHECKPOINTS_DIR.glob(
        f'{model.__class__.__name__}_{filename}_checkpoint_*.pt'))
    if len(checkpoints) > 0:
        # load the latest checkpoint
        checkpoints.sort(key=os.path.getctime)
        latest_checkpoint_path = checkpoints[-1]
        start_step, results, sequences = load_checkpoint(
            latest_checkpoint_path, model, optimizer)
    else:
        # initialize the start step, results and sequences
        start_step = 0
        results = {
            'step': [],
            'accuracy': [],
            'loss': [],
        }
        sequences = {
            'step': [],
            't': [],
            'temperature': [],
            'sequence': []
        }

    for step in range(start_step, int(config.train_steps)):
        # reinitialize the data_loader iterator once we have iterated over all
        # available mini-batches
        if step % len(data_loader) == 0 or step == start_step:
            data_iter = iter(data_loader)

        # get the mini-batch
        batch_inputs, batch_targets = next(data_iter)

        # Only for time measurement of step through network
        t1 = time.time()

        #######################################################
        # Add more code here ...
        #######################################################

        # put the model in training mode
        model.train()

        # convert the data and send it to the device
        X = torch.stack(batch_inputs, dim=1)
        X = X.to(device)
        Y = torch.stack(batch_targets, dim=1)
        Y = Y.to(device)

        # forward pass the mini-batch
        Y_out, _ = model.forward(X)
        Y_pred = Y_out.argmax(dim=-1)

        # (re)set the optimizer gradients to 0
        optimizer.zero_grad()

        # compute the accuracy and the loss
        accuracy = get_accuracy(Y_pred, Y)
        loss = criterion.forward(Y_out.transpose(2, 1), Y)

        # backpropagate the loss
        loss.backward()

        # clip the gradients (to prevent them from exploding)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.max_norm)

        # tune the model parameters
        optimizer.step()

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size / float(t2 - t1)

        if step % config.print_every == 0:
            print(f'[{datetime.now().strftime("%Y-%m-%d %H:%M")}], '
                  f'Train Step {step:04d}/{int(config.train_steps):04d}, '
                  f'Batch Size = {config.batch_size}, '
                  f'Examples/Sec = {examples_per_second:.2f}, '
                  f'Accuracy = {accuracy:.2f}, Loss = {loss:.3f}')

            # append the accuracy and loss to the results
            results['step'].append(step)
            results['accuracy'].append(accuracy.item())
            results['loss'].append(loss.item())

        if step % config.sample_every == 0:
            for T in [20, 30, 60, 120]:
                for temperature in [0.0, 0.5, 1.0, 2.0]:
                    # Generate some sentences by sampling from the model
                    sequence = sample_sequence(
                        model=model,
                        vocab_size=VOCAB_SIZE,
                        T=T,
                        char=None,
                        temperature=temperature,
                        device=device
                    )
                    sequence_str = dataset.convert_to_string(sequence)
                    print(f'Generated sample sequence (T={T}, temp={temperature}): {sequence_str}')

                    # append the generated sequence to the sequences
                    sequences['step'].append(step)
                    sequences['t'].append(T)
                    sequences['temperature'].append(temperature)
                    sequences['sequence'].append(sequence_str)

        if step % config.checkpoint_every == 0:
            # create a checkpoint
            create_checkpoint(CHECKPOINTS_DIR, filename, step, model, optimizer,
                              results, sequences)

            # save the results
            save_results(RESULTS_DIR, filename, results, sequences, model)

            # save the model
            save_model(MODELS_DIR, filename, model)

        if step == config.train_steps:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break

    print('Done training.')
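# The checkpoint helpers used above are defined elsewhere. A plausible, minimal
# sketch matching the call sites `create_checkpoint(CHECKPOINTS_DIR, filename,
# step, model, optimizer, results, sequences)` and `load_checkpoint(path, model,
# optimizer)`; the file layout and key names here are assumptions:
import torch


def create_checkpoint(checkpoints_dir, filename, step, model, optimizer,
                      results, sequences):
    """Persist everything needed to resume training at `step`."""
    path = checkpoints_dir / f'{model.__class__.__name__}_{filename}_checkpoint_{step}.pt'
    torch.save({
        'step': step,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'results': results,
        'sequences': sequences,
    }, path)


def load_checkpoint(path, model, optimizer):
    """Restore model/optimizer state and return (start_step, results, sequences)."""
    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    return checkpoint['step'], checkpoint['results'], checkpoint['sequences']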
        # (fragment: body of the per-batch loop over train_data_loader, inside the epoch loop)
        target_caption = nn.utils.rnn.pack_padded_sequence(
            caption, caption_len, batch_first=True)[0]

        optimizer.zero_grad()

        cnn_feature = cnn(image)
        rnn_tokenized_sentence, alphas = rnn(cnn_feature, caption, caption_len)
        loss = loss_fn(rnn_tokenized_sentence, target_caption)
        # Doubly Stochastic Attention regularization
        loss += params['alpha_c'] * ((1. - alphas.sum(dim=1)) ** 2).mean()
        train_loss.append(loss.data.item())

        loss.backward()
        optimizer.step()
        #*print('One batch completed');

        if (idx + 1) % 5000 == 0:
            create_checkpoint(cnn, rnn, optimizer, epoch + 1, idx + 1,
                              train_loss, params)
        if (idx + 1) % 500 == 0 or (idx + 1) == len(train_data_loader):
            print("Epoch %d (Step %d) - %0.4f train loss, %0.2f time." %
                  (epoch + 1, idx + 1, loss, time.time() - start_time))
        #*if idx == 2: break
    #*break
    print("Epoch %d - %0.4f loss, %.2f time. " %
          (epoch + 1, np.mean(train_loss), time.time() - start_time))
    create_checkpoint(cnn, rnn, optimizer, epoch + 1, idx + 1, train_loss, params)

print('Training completed.')

import pickle
def generate_clean_db(language, batch_size, sleep_time, parameters):
    lines = []
    csv_data = []
    clips_output_path = ''
    index = 0

    if parameters.checkpoint_path is not None:
        if not os.path.isdir(parameters.checkpoint_path):
            print('Checkpoint Path Does Not Exist.')
            sys.exit()
        print('Loading Checkpoint.')
        clips_output_path = os.path.join(parameters.checkpoint_path, 'clips')
        index, lines, csv_data = utils.load_checkpoint(parameters.checkpoint_path)
    else:
        if (not os.path.isdir(parameters.input_path)) and (not os.path.isfile(
                parameters.input_path)):
            print('Input Text File/Folder Does Not Exist.')
            sys.exit()
        if not os.path.isdir(parameters.output_path):
            print('Output Path Does Not Exist.')
            sys.exit()

        now = datetime.now().strftime('%Y-%m-%d*%H-%M-%S')
        clips_output_path = os.path.join(parameters.output_path, 'corpus-' + now,
                                         'clips')
        os.makedirs(clips_output_path)

        all_files = []
        if os.path.isdir(parameters.input_path):
            print('Input Is A Directory.')
            for file in os.listdir(parameters.input_path):
                if file.endswith(".txt"):
                    all_files.append(os.path.join(parameters.input_path, file))
        elif os.path.isfile(parameters.input_path):
            print('Input Is A Text File.')
            all_files.append(parameters.input_path)

        for file in all_files:
            current_file = open(file, 'r')
            for line in current_file:
                lines.append(line.strip())

        utils.create_checkpoint(os.path.split(clips_output_path)[0], lines,
                                all_files)

    print('Converting Text To Speech And Generating Dataset.')
    print('Dataset Path: %s.' % os.path.split(clips_output_path)[0])

    for line in lines[index:]:
        pair = []
        current_file, current_file_slow = g_tts(line, language, clips_output_path,
                                                index, parameters.generate_male)
        current_file_size = os.path.getsize(current_file)
        current_file_size_slow = os.path.getsize(current_file_slow)

        print('Processing Item: %d/%d, Type: Normal.' % (index + 1, len(lines)))
        is_valid, file, size = utils.check_audio(line, current_file)
        if is_valid == True:
            csv_data.append(
                [os.path.basename(file), str(size), line.strip().lower()])
            pair.append(csv_data[-1])
        else:
            print('Duration Of %s File Is Not Valid (Must Be Between 5-20 Seconds.)'
                  % os.path.basename(current_file))
            os.remove(file)

        print('Processing Item: %d/%d, Type: Slow.' % (index + 1, len(lines)))
        is_valid, file, size = utils.check_audio(line, current_file_slow)
        if is_valid == True:
            csv_data.append(
                [os.path.basename(file), str(size), line.strip().lower()])
            pair.append(csv_data[-1])
        else:
            print('Duration Of %s File Is Not Valid (Must Be Between 0.5-20 Seconds.)'
                  % os.path.basename(current_file_slow))
            os.remove(file)

        utils.save_checkpoint(os.path.split(clips_output_path)[0], pair, index)
        index += 1
        if index % batch_size == 0:
            print('Sleep (%d Seconds).' % sleep_time)
            time.sleep(sleep_time)

    return csv_data, clips_output_path
def train():
    model_type = FLAGS.model_type
    run_desc = FLAGS.run_desc
    run_desc_tl = FLAGS.run_desc_tl
    data_dir = Path(FLAGS.data_dir)
    checkpoints_dir = Path(FLAGS.checkpoints_dir) / model_type / run_desc
    models_dir = Path(FLAGS.models_dir) / model_type / run_desc
    results_dir = Path(FLAGS.results_dir) / model_type / run_desc
    checkpoints_dir_tl = Path(FLAGS.checkpoints_dir) / model_type / run_desc_tl
    models_dir_tl = Path(FLAGS.models_dir) / model_type / run_desc_tl
    results_dir_tl = Path(FLAGS.results_dir) / model_type / run_desc_tl
    learning_rate = FLAGS.learning_rate
    batch_size_fn = FLAGS.batch_size
    epoch_no = FLAGS.epoch
    sent_hidden_dim = FLAGS.sent_hidden_dim
    doc_hidden_dim = FLAGS.doc_hidden_dim

    if not data_dir.exists():
        raise ValueError('Data directory does not exist')

    # create other directories if they do not exist
    create_directories(checkpoints_dir_tl, models_dir_tl, results_dir_tl)

    # load the data
    print('Loading the data...')

    # get the GloVe and ELMo embeddings
    glove_dim = 0
    elmo_dim = 0
    GloVe_vectors = None
    ELMo = None
    if 'glove' in model_type:
        GloVe_vectors = GloVe()
        glove_dim = WORD_EMBED_DIM
        print('Uploaded GloVe embeddings.')
    if 'elmo' in model_type:
        ELMo = Elmo(options_file=ELMO_OPTIONS_FILE,
                    weight_file=ELMO_WEIGHT_FILE,
                    num_output_representations=1,
                    requires_grad=False,
                    dropout=0).to(DEVICE)
        elmo_dim = ELMO_EMBED_DIM
        print('Uploaded ELMo embeddings.')
    input_dim = glove_dim + elmo_dim

    # get the FNN data
    keys = ['train', 'test', 'val']
    FNN_DL_small = {}
    for i in keys:
        FNN_temp = FNNDataset(data_dir / ('FNN_small_' + i + '.pkl'),
                              GloVe_vectors, ELMo)
        FNN_DL_temp = data.DataLoader(dataset=FNN_temp,
                                      batch_size=batch_size_fn,
                                      num_workers=0,
                                      shuffle=True,
                                      drop_last=True,
                                      collate_fn=PadSortBatchFNN())
        FNN_DL_small[i] = FNN_DL_temp
    print('Uploaded FNN data.')

    # initialize the model, according to the model type
    print('Initializing the model for transfer learning...', end=' ')
    model = HierarchicalAttentionNet(input_dim=input_dim,
                                     sent_hidden_dim=sent_hidden_dim,
                                     doc_hidden_dim=doc_hidden_dim,
                                     num_classes=NUM_CLASSES_FN,
                                     dropout=0).to(DEVICE)
    print('Done!')
    print_model_parameters(model)
    print()
    print('Working on: ', end='')
    print(DEVICE)

    # set the criterion and optimizer
    # we weigh the loss: class [0] is real, class [1] is fake
    #
    loss_func_fn = nn.CrossEntropyLoss()
    optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

    # load the last checkpoint (if it exists)
    results = {
        'epoch': [],
        'train_loss': [],
        'train_accuracy': [],
        'val_loss': [],
        'val_accuracy': []
    }
    if epoch_no == '0':
        model_path = models_dir / Path('HierarchicalAttentionNet_model.pt')
        _, _, _ = load_latest_checkpoint(model_path, model, optimizer)
    else:
        checkpoint_path = checkpoints_dir / Path(
            'HierarchicalAttentionNet_Adam_checkpoint_' + str(epoch_no) + '_.pt')
        _, _, _ = load_checkpoint(checkpoint_path, model, optimizer)
    print(f'Starting transfer learning on the model extracted from {epoch_no}')

    epoch = 0
    for i in range(epoch, MAX_EPOCHS):
        print(f'Epoch {i+1:0{len(str(MAX_EPOCHS))}}/{MAX_EPOCHS}:')
        model.train()

        # one epoch of training
        train_loss_fn, train_acc_fn = train_epoch_fn(FNN_DL_small['train'], model,
                                                     optimizer, loss_func_fn)

        # one epoch of eval
        model.eval()
        val_loss_fn, val_acc_fn = eval_epoch_fn(FNN_DL_small['val'], model,
                                                loss_func_fn)

        results['epoch'].append(i)
        results['train_loss'].append(train_loss_fn)
        results['train_accuracy'].append(train_acc_fn)
        results['val_loss'].append(val_loss_fn)
        results['val_accuracy'].append(val_acc_fn)
        #print(results)

        best_accuracy = torch.tensor(val_acc_fn).max().item()
        create_checkpoint(checkpoints_dir_tl, i, model, optimizer, results,
                          best_accuracy)

    # save and plot the results
    save_results(results_dir_tl, results, model)
    save_model(models_dir_tl, model)
def map_fn(index=None, flags=None):
    torch.set_default_tensor_type('torch.FloatTensor')
    torch.manual_seed(1234)

    train_data = dataset.DATA(config.TRAIN_DIR)
    if config.MULTI_CORE:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_data,
            num_replicas=xm.xrt_world_size(),
            rank=xm.get_ordinal(),
            shuffle=True)
    else:
        train_sampler = torch.utils.data.RandomSampler(train_data)
    train_loader = torch.utils.data.DataLoader(
        train_data,
        batch_size=flags['batch_size'] if config.MULTI_CORE else config.BATCH_SIZE,
        sampler=train_sampler,
        num_workers=flags['num_workers'] if config.MULTI_CORE else 4,
        drop_last=True,
        pin_memory=True)

    if config.MULTI_CORE:
        DEVICE = xm.xla_device()
    else:
        DEVICE = config.DEVICE

    netG = model.colorization_model().double()
    netD = model.discriminator_model().double()
    VGG_modelF = torchvision.models.vgg16(pretrained=True).double()
    VGG_modelF.requires_grad_(False)

    netG = netG.to(DEVICE)
    netD = netD.to(DEVICE)
    VGG_modelF = VGG_modelF.to(DEVICE)

    optD = torch.optim.Adam(netD.parameters(), lr=2e-4, betas=(0.5, 0.999))
    optG = torch.optim.Adam(netG.parameters(), lr=2e-4, betas=(0.5, 0.999))

    ## Train
    train_start = time.time()
    losses = {
        'G_losses': [],
        'D_losses': [],
        'EPOCH_G_losses': [],
        'EPOCH_D_losses': [],
        'G_losses_eval': []
    }

    netG, optG, netD, optD, epoch_checkpoint = utils.load_checkpoint(
        config.CHECKPOINT_DIR, netG, optG, netD, optD, DEVICE)
    netGAN = model.GAN(netG, netD)

    for epoch in range(
            epoch_checkpoint,
            flags['num_epochs'] + 1 if config.MULTI_CORE else config.NUM_EPOCHS + 1):
        print('\n')
        print('#' * 8, f'EPOCH-{epoch}', '#' * 8)
        losses['EPOCH_G_losses'] = []
        losses['EPOCH_D_losses'] = []

        if config.MULTI_CORE:
            para_train_loader = pl.ParallelLoader(
                train_loader, [DEVICE]).per_device_loader(DEVICE)
            engine.train(para_train_loader, netGAN, netD, VGG_modelF, optG, optD,
                         device=DEVICE, losses=losses)
            elapsed_train_time = time.time() - train_start
            print("Process", index, "finished training. Train time was:",
                  elapsed_train_time)
        else:
            engine.train(train_loader, netGAN, netD, VGG_modelF, optG, optD,
                         device=DEVICE, losses=losses)

        ######################### CHECKPOINTING ################################
        utils.create_checkpoint(epoch, netG, optG, netD, optD,
                                max_checkpoint=config.KEEP_CKPT,
                                save_path=config.CHECKPOINT_DIR)
        #########################################################################
        utils.plot_some(train_data, netG, DEVICE, epoch)
        gc.collect()
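# The GAN script above relies on `utils.load_checkpoint` / `utils.create_checkpoint`
# to resume training and rotate old checkpoints. A rough sketch consistent with the
# call sites; the file naming scheme and rotation policy are assumptions, not the
# project's actual code:
import glob
import os

import torch


def create_checkpoint(epoch, netG, optG, netD, optD, max_checkpoint, save_path):
    """Save generator/discriminator state and keep at most `max_checkpoint` files."""
    torch.save({
        'epoch': epoch,
        'netG': netG.state_dict(),
        'optG': optG.state_dict(),
        'netD': netD.state_dict(),
        'optD': optD.state_dict(),
    }, os.path.join(save_path, f'checkpoint_epoch_{epoch}.pt'))
    checkpoints = sorted(glob.glob(os.path.join(save_path, 'checkpoint_epoch_*.pt')),
                         key=os.path.getctime)
    for old in checkpoints[:-max_checkpoint]:
        os.remove(old)


def load_checkpoint(checkpoint_dir, netG, optG, netD, optD, device):
    """Restore the latest checkpoint if present; return nets, optimizers, next epoch."""
    checkpoints = sorted(glob.glob(os.path.join(checkpoint_dir, 'checkpoint_epoch_*.pt')),
                         key=os.path.getctime)
    if not checkpoints:
        return netG, optG, netD, optD, 1
    state = torch.load(checkpoints[-1], map_location=device)
    netG.load_state_dict(state['netG'])
    optG.load_state_dict(state['optG'])
    netD.load_state_dict(state['netD'])
    optD.load_state_dict(state['optD'])
    return netG, optG, netD, optD, state['epoch'] + 1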
def train():
    model_type = FLAGS.model_type
    run_desc = FLAGS.run_desc
    data_dir = Path(FLAGS.data_dir)
    checkpoints_dir = Path(FLAGS.checkpoints_dir) / model_type / run_desc
    models_dir = Path(FLAGS.models_dir) / model_type / run_desc
    results_dir = Path(FLAGS.results_dir) / model_type / run_desc
    learning_rate = LEARNING_RATE
    sent_hidden_dim = FLAGS.sent_hidden_dim
    doc_hidden_dim = FLAGS.doc_hidden_dim

    if not data_dir.exists():
        raise ValueError('Data directory does not exist')

    # create other directories if they do not exist
    create_directories(checkpoints_dir, models_dir, results_dir)

    # load the data
    print('Loading the data...')

    # get the GloVe and ELMo embeddings
    glove_dim = 0
    elmo_dim = 0
    GloVe_vectors = None
    ELMo = None
    if 'glove' in model_type:
        GloVe_vectors = GloVe()
        glove_dim = WORD_EMBED_DIM
        print('Uploaded GloVe embeddings.')
    if 'elmo' in model_type:
        ELMo = Elmo(options_file=ELMO_OPTIONS_FILE,
                    weight_file=ELMO_WEIGHT_FILE,
                    num_output_representations=1,
                    requires_grad=False,
                    dropout=0).to(DEVICE)
        elmo_dim = ELMO_EMBED_DIM
        print('Uploaded ELMo embeddings.')
    input_dim = glove_dim + elmo_dim

    # get the FNN data
    FNN = {}
    FNN_DL = {}
    for path in ['train', 'val', 'test']:
        FNN[path] = FNNDataset(data_dir / ('FNN_' + path + '.pkl'),
                               GloVe_vectors, ELMo)
        FNN_DL[path] = data.DataLoader(dataset=FNN[path],
                                       batch_size=BATCH_SIZE_FN,
                                       num_workers=0,
                                       shuffle=True,
                                       drop_last=True,
                                       collate_fn=PadSortBatchFNN())
    print('Uploaded FNN data.')
    fnn_train_sent_no = get_number_sentences(data_dir / 'FNN_train.pkl')
    fnn_train_len = len(FNN['train'])

    # initialize the model, according to the model type
    print('Initializing the model...', end=' ')
    model = HierarchicalAttentionNet(input_dim=input_dim,
                                     sent_hidden_dim=sent_hidden_dim,
                                     doc_hidden_dim=doc_hidden_dim,
                                     num_classes=NUM_CLASSES_FN,
                                     dropout=0).to(DEVICE)
    print('Working on: ', end='')
    print(DEVICE)
    print('Done!')
    print_model_parameters(model)
    print()

    # set the criterion and optimizer
    # we weigh the loss: class [0] is real, class [1] is fake
    #
    real_ratio, fake_ratio = get_class_balance(data_dir / 'FNN_train.pkl')
    weights = [(1.0 - real_ratio), (1.0 - fake_ratio)]
    print(weights)
    class_weights = torch.FloatTensor(weights).to(DEVICE)
    loss_func_fn = nn.CrossEntropyLoss(weight=class_weights)
    optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

    # load the last checkpoint (if it exists)
    results = {
        'epoch': [],
        'train_loss': [],
        'train_accuracy': [],
        'val_loss': [],
        'val_accuracy': []
    }
    epoch, results, best_accuracy = load_latest_checkpoint(checkpoints_dir, model,
                                                           optimizer)
    if epoch == 0:
        print(f'Starting training at epoch {epoch + 1}...')
    else:
        print(f'Resuming training from epoch {epoch + 1}...')

    for i in range(epoch, MAX_EPOCHS):
        print(f'Epoch {i+1:0{len(str(MAX_EPOCHS))}}/{MAX_EPOCHS}:')
        model.train()

        # one epoch of training
        train_loss_fn, train_acc_fn = train_epoch_fn(FNN_DL['train'], model,
                                                     optimizer, loss_func_fn)

        # one epoch of eval
        model.eval()
        val_loss_fn, val_acc_fn = eval_epoch_fn(FNN_DL['val'], model, loss_func_fn)

        results['epoch'].append(i)
        results['train_loss'].append(train_loss_fn)
        results['train_accuracy'].append(train_acc_fn)
        results['val_loss'].append(val_loss_fn)
        results['val_accuracy'].append(val_acc_fn)
        #print(results)

        best_accuracy = torch.tensor(val_acc_fn).max().item()
        create_checkpoint(checkpoints_dir, i, model, optimizer, results,
                          best_accuracy)

        if (i + 1) % 4 == 0 and i != 0:
            learning_rate = learning_rate / 2
            optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

    # save and plot the results
    save_results(results_dir, results, model)
    save_model(models_dir, model)
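# `load_latest_checkpoint` and `create_checkpoint` are used by these HAN training
# scripts but defined elsewhere. A minimal sketch matching the call sites; the file
# naming scheme and the defaults returned when no checkpoint exists are assumptions:
import torch


def create_checkpoint(checkpoints_dir, epoch, model, optimizer, results, best_accuracy):
    """Save a per-epoch checkpoint together with the running results dict."""
    path = checkpoints_dir / f'{model.__class__.__name__}_checkpoint_{epoch}.pt'
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'results': results,
        'best_accuracy': best_accuracy,
    }, path)


def load_latest_checkpoint(checkpoints_dir, model, optimizer):
    """Restore the newest checkpoint; return (next_epoch, results, best_accuracy)."""
    checkpoints = sorted(checkpoints_dir.glob('*_checkpoint_*.pt'),
                         key=lambda p: p.stat().st_ctime)
    if not checkpoints:
        # no checkpoint yet: start from scratch with an empty results dict (assumed)
        return 0, {'epoch': [], 'train_loss': [], 'train_accuracy': [],
                   'val_loss': [], 'val_accuracy': []}, 0.0
    state = torch.load(checkpoints[-1])
    model.load_state_dict(state['model_state_dict'])
    optimizer.load_state_dict(state['optimizer_state_dict'])
    return state['epoch'] + 1, state['results'], state['best_accuracy']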
def train():
    model_type = FLAGS.model_type
    run_desc = FLAGS.run_desc
    data_dir = Path(FLAGS.data_dir)
    checkpoints_dir = Path(FLAGS.checkpoints_dir) / model_type / run_desc
    models_dir = Path(FLAGS.models_dir) / model_type / run_desc
    results_dir = Path(FLAGS.results_dir) / model_type / run_desc
    #data_percentage = FLAGS.data_percentage

    if model_type == 'STL':
        only_fn = True
    else:
        only_fn = False

    # check if data directory exists
    if not data_dir.exists():
        raise ValueError('Data directory does not exist')

    # create other directories if they do not exist
    create_directories(checkpoints_dir, models_dir, results_dir)

    # load the data
    print('Loading the data...')

    # get the GloVe and ELMo embeddings
    GloVe_vectors = GloVe()
    print('Uploaded GloVe embeddings.')
    # ELMo = Elmo(
    #     options_file=ELMO_OPTIONS_FILE,
    #     weight_file=ELMO_WEIGHT_FILE,
    #     num_output_representations=1,
    #     requires_grad=False,
    #     dropout=0).to(DEVICE)
    # print('Uploaded Elmo embeddings.')

    # get the FNN and SNLI data
    FNN = {}
    FNN_DL = {}
    for path in ['train', 'val', 'test']:
        FNN[path] = FNNDataset(data_dir / ('FNN_' + path + '.pkl'), GloVe_vectors)
        FNN_DL[path] = data.DataLoader(dataset=FNN[path],
                                       batch_size=BATCH_SIZE_FN,
                                       num_workers=0,
                                       shuffle=True,
                                       drop_last=True,
                                       collate_fn=PadSortBatch())
    print('Uploaded FNN data.')

    if not only_fn:
        SNLI = {}
        SNLI_DL = {}
        for path in ['train', 'val', 'test']:
            SNLI[path] = SNLIDataset(data_dir / ('SNLI_' + path + '.pkl'),
                                     GloVe_vectors)
            SNLI_DL[path] = data.DataLoader(dataset=SNLI[path],
                                            batch_size=BATCH_SIZE_NLI,
                                            num_workers=0,
                                            shuffle=True,
                                            drop_last=True,
                                            collate_fn=PadSortBatchSNLI())
        print('Uploaded SNLI data.')
        snli_train_sent_no = len(SNLI['train']) * 2
        snli_train_len = len(SNLI['train'])

    fnn_train_sent_no = get_number_sentences(data_dir / 'FNN_train.pkl')
    fnn_train_len = len(FNN['train'])

    # initialize the model, according to the model type
    print('Initializing the model...', end=' ')
    if model_type == 'MTL':
        NUM_CLASSES_NLI = 3
        print("Loading an MTL HAN model.")
    elif model_type == 'STL':
        NUM_CLASSES_NLI = None
        print("Loading an STL HAN model.")
    elif model_type == 'Transfer':
        print("Nothing for now.")

    if ELMO_EMBED_DIM is not None:
        # input_dim = WORD_EMBED_DIM + ELMO_EMBED_DIM
        input_dim = WORD_EMBED_DIM
    else:
        input_dim = WORD_EMBED_DIM

    model = HierarchicalAttentionNet(input_dim=input_dim,
                                     hidden_dim=WORD_HIDDEN_DIM,
                                     num_classes_task_fn=NUM_CLASSES_FN,
                                     embedding=None,
                                     num_classes_task_nli=NUM_CLASSES_NLI,
                                     dropout=0).to(DEVICE)
    print('Working on: ', end='')
    print(DEVICE)
    print('Done!')
    print_model_parameters(model)
    print()

    # set the criterion and optimizer
    # we weigh the loss: class [0] is real, class [1] is fake
    #
    real_ratio, fake_ratio = get_class_balance(data_dir / 'FNN_train.pkl')
    weights = [(1.0 - real_ratio), (1.0 - fake_ratio)]
    print(weights)
    class_weights = torch.FloatTensor(weights).to(DEVICE)
    loss_func_fn = nn.CrossEntropyLoss(weight=class_weights)
    if not only_fn:
        loss_func_nli = nn.CrossEntropyLoss()
        temperature = 2
    optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

    # load the last checkpoint (if it exists)
    epoch, results, best_accuracy = load_latest_checkpoint(checkpoints_dir, model,
                                                           optimizer)
    results_fn = {
        'epoch': [],
        'train_loss': [],
        'train_accuracy': [],
        'val_loss': [],
        'val_accuracy': []
    }
    results_nli = {
        'epoch': [],
        'train_loss': [],
        'train_accuracy': [],
        'val_loss': [],
        'val_accuracy': []
    }
    results = {'fn': results_fn, 'nli': results_nli}

    if epoch == 0:
        print(f'Starting training at epoch {epoch + 1}...')
    else:
        print(f'Resuming training from epoch {epoch + 1}...')
    for i in range(epoch, MAX_EPOCHS):
        print(f'Epoch {i+1:0{len(str(MAX_EPOCHS))}}/{MAX_EPOCHS}:')
        model.train()

        # one epoch of training
        if only_fn:
            train_loss_fn, train_acc_fn = train_epoch_fn(FNN_DL['train'], model,
                                                         optimizer, loss_func_fn)
        elif model_type == 'MTL':
            model.train()
            train_loss_fn = []
            train_acc_fn = []
            loss_fn_weight_gradnorm = 1
            train_loss_nli = []
            train_acc_nli = []
            loss_nli_weight_gradnorm = 1

            # define by sentence number
            #loss_fn_weight_dataset = 1 - fnn_train_sent_no / (fnn_train_sent_no + snli_train_sent_no)
            #loss_nli_weight_dataset = 1 - snli_train_sent_no / (fnn_train_sent_no + snli_train_sent_no)
            loss_fn_weight_dataset = 1 - fnn_train_len / (fnn_train_len + snli_train_len)
            loss_nli_weight_dataset = 1 - snli_train_len / (fnn_train_len + snli_train_len)

            chance_fn = 1000 * (fnn_train_len / BATCH_SIZE_FN) / (
                (fnn_train_len / BATCH_SIZE_FN) + (snli_train_len / BATCH_SIZE_NLI))

            iterator_fnn = enumerate(FNN_DL['train'])
            iterator_snli = enumerate(SNLI_DL['train'])
            done_fnn, done_snli = False, False
            step_fnn = 0
            step_snli = 0

            print(f'Train set length, FNN: {fnn_train_len}. '
                  f'Train set length, SNLI: {snli_train_len}.')
            print(f'Training set to batch size ratio for Fake News Detection '
                  f'is {fnn_train_len / BATCH_SIZE_FN}.')
            print(f'Training set to batch size ratio for Language Inference '
                  f'is {snli_train_len / BATCH_SIZE_NLI}.')

            while not (done_fnn and done_snli):
                if len(train_loss_fn) > 1 and len(train_loss_nli) > 1:
                    # compute loss weights based on the losses from the previous iterations
                    loss_fn_ratio = train_loss_fn[len(train_loss_fn) - 1] / \
                        train_loss_fn[len(train_loss_fn) - 2]
                    loss_nli_ratio = train_loss_nli[len(train_loss_nli) - 1] / \
                        train_loss_nli[len(train_loss_nli) - 2]
                    loss_fn_exp = math.exp(loss_fn_ratio / temperature)
                    loss_nli_exp = math.exp(loss_nli_ratio / temperature)
                    loss_fn_weight_gradnorm = loss_fn_exp / (loss_fn_exp + loss_nli_exp)
                    loss_nli_weight_gradnorm = loss_nli_exp / (loss_fn_exp + loss_nli_exp)
                    loss_fn_weight = math.exp(
                        loss_fn_weight_dataset * loss_fn_weight_gradnorm) / (
                            math.exp(loss_fn_weight_dataset * loss_fn_weight_gradnorm) +
                            math.exp(loss_nli_weight_dataset * loss_nli_weight_gradnorm))
                    loss_nli_weight = math.exp(
                        loss_nli_weight_dataset * loss_nli_weight_gradnorm) / (
                            math.exp(loss_fn_weight_dataset * loss_fn_weight_gradnorm) +
                            math.exp(loss_nli_weight_dataset * loss_nli_weight_gradnorm))
                else:
                    loss_fn_weight = loss_fn_weight_dataset
                    loss_nli_weight = loss_nli_weight_dataset

                # define the total loss function
                #loss_func = loss_func_fn + loss_func_nli # is this needed?

                if np.random.randint(0, 1000) < chance_fn:
                    try:
                        step_fnn, batch_fnn = next(iterator_fnn)
                    except StopIteration:
                        done_fnn = True
                    else:
                        try:
                            batch_loss_fn, batch_acc_fn = train_batch_fn(
                                batch_fnn, model, optimizer, loss_func_fn,
                                loss_fn_weight)
                            train_loss_fn.append(batch_loss_fn)
                            train_acc_fn.append(batch_acc_fn)
                        except:
                            print('Error in batch')
                else:
                    try:
                        step_snli, batch_snli = next(iterator_snli)
                    except StopIteration:
                        done_snli = True
                    else:
                        try:
                            batch_loss_nli, batch_acc_nli = train_batch_nli(
                                batch_snli, model, optimizer, loss_func_nli,
                                loss_nli_weight)
                            train_loss_nli.append(batch_loss_nli)
                            train_acc_nli.append(batch_acc_nli)
                        except:
                            print('Error in batch')

                print(f'FNN batch {step_fnn}')
                print(f'SNLI batch {step_snli}')
                if step_fnn % 50 == 0 and step_fnn != 0:
                    print(f'Processed {step_fnn} FNN batches.')
                    print(f'Accuracy: {train_acc_fn[len(train_acc_fn)-1]}.')
                    print(f'Weight for loss for NLI is {loss_nli_weight}, '
                          f'for loss for FN is {loss_fn_weight}.')
                if step_snli % 50 == 0 and step_snli != 0:
                    print(f'Processed {step_snli} SNLI batches.')
                    print(f'Accuracy: {train_acc_nli[len(train_acc_nli)-1]}.')
                    print(f'Weight for loss for NLI is {loss_nli_weight}, '
                          f'for loss for FN is {loss_fn_weight}.')

        # one epoch of eval
        model.eval()
        val_loss_fn, val_acc_fn = eval_epoch_fn(FNN_DL['val'], model, loss_func_fn)
        tasks = ['fn']
        if model_type == 'MTL':
            val_loss_nli, val_acc_nli = eval_epoch_nli(SNLI_DL['val'], model,
                                                       loss_func_nli)
            tasks.append('nli')

        for task in tasks:
            results[task]['epoch'].append(i)
            if task == 'fn':
                temp_train_loss = train_loss_fn
                temp_val_loss = val_loss_fn
                temp_train_acc = train_acc_fn
                temp_val_acc = val_acc_fn
            elif task == 'nli':
                temp_train_loss = train_loss_nli
                temp_val_loss = val_loss_nli
                temp_train_acc = train_acc_nli
                temp_val_acc = val_acc_nli
            results[task]['train_loss'].append(temp_train_loss)
            results[task]['train_accuracy'].append(temp_train_acc)
            results[task]['val_loss'].append(temp_val_loss)
            results[task]['val_accuracy'].append(temp_val_acc)
            print(results)

            best_accuracy = torch.tensor(temp_val_acc).max().item()
            create_checkpoint(checkpoints_dir, epoch, model, optimizer, results,
                              best_accuracy)

    # save and plot the results
    save_results(results_dir, results, model)
    save_model(models_dir, model)
    plot_results(results_dir, results, model)
def train():
    data_dir = Path(FLAGS.data_dir)
    checkpoints_dir = Path(FLAGS.checkpoints_dir)
    models_dir = Path(FLAGS.models_dir)
    results_dir = Path(FLAGS.results_dir)

    if not data_dir.exists():
        raise ValueError('Data directory does not exist')

    # create other directories if they do not exist
    create_directories(checkpoints_dir, models_dir, results_dir)

    # load the data
    print('Loading the data...')
    adj_file = data_dir / 'adj_matrix.npz'
    features_file = data_dir / 'features_matrix.pkl'
    labels_file = data_dir / 'labels_matrix.pkl'
    splits_file = data_dir / 'splits_dict.pkl'
    adj, features, labels, splits_dict = load_data(adj_file, features_file,
                                                   labels_file, splits_file)
    train_idxs = splits_dict['train']
    val_idxs = splits_dict['val']
    test_idxs = splits_dict['test']

    # initialize the model, according to the model type
    print('Initializing the model...')
    model = GraphConvolutionalNetwork(
        input_dim=features.shape[1],
        hidden_dim=HIDDEN_DIM,
        num_classes=labels.max().item() + 1,
        dropout=DROPOUT
    ).to(DEVICE)
    # print_model_parameters(model)

    # set the criterion and optimizer
    print('Initializing the criterion and optimizer')
    criterion = nn.NLLLoss()
    optimizer = optim.Adam(
        params=model.parameters(),
        lr=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY
    )

    # initialize the results dict
    results = {
        'epoch': [],
        'train_loss': [],
        'train_acc': [],
        'val_loss': [],
        'val_acc': []
    }

    print('Starting training at epoch 1...')
    for i in range(0, MAX_EPOCHS):
        st = time()

        # train
        model.train()
        optimizer.zero_grad()

        # forward pass
        output = model(features, adj)

        # compute the training loss and accuracy
        train_targets = labels[train_idxs].max(dim=1).indices
        train_loss = criterion(output[train_idxs], train_targets)
        train_acc = accuracy(output[train_idxs], train_targets)

        # backpropagate the loss
        train_loss.backward()
        optimizer.step()

        # evaluate
        model.eval()
        output = model(features, adj)
        val_targets = labels[val_idxs].max(dim=1).indices
        val_loss = criterion(output[val_idxs], val_targets)
        val_acc = accuracy(output[val_idxs], val_targets)

        # record results
        results['epoch'].append(i)
        results['train_loss'].append(train_loss.item())
        results['train_acc'].append(train_acc.item())
        results['val_loss'].append(val_loss.item())
        results['val_acc'].append(val_acc.item())

        # print update
        print(f'Epoch: {i+1:02d} Train loss: {train_loss.item():0.4f} '
              f'Train acc: {train_acc:0.4f} Val loss: {val_loss.item():0.4f} '
              f'Val acc: {val_acc:0.4f} done in {time() - st} s')

        # create a checkpoint
        create_checkpoint(checkpoints_dir, i, model, optimizer, results)

    # test
    model.eval()
    output = model(features, adj)
    test_targets = labels[test_idxs].max(dim=1).indices
    test_loss = criterion(output[test_idxs], test_targets)
    test_acc = accuracy(output[test_idxs], test_targets)

    # record results
    results['test_loss'] = test_loss.item()
    results['test_acc'] = test_acc.item()

    # save the model and results
    save_model(models_dir, model)
    save_results(results_dir, results, model)
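# The `accuracy` helper used above is not defined in this snippet. A straightforward
# sketch of what it typically computes for class-index targets (the name and
# signature match the call sites; the implementation is an assumption):
import torch


def accuracy(output, targets):
    """Fraction of predictions (argmax over class scores) that match the targets."""
    preds = output.argmax(dim=1)
    return (preds == targets).float().mean()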
def main(cfg):
    os.makedirs(str(cfg.output_dir + f"/fold{cfg.fold}/"), exist_ok=True)

    # set random seed; works when using all data to train
    if cfg.seed < 0:
        cfg.seed = np.random.randint(1_000_000)
    set_seed(cfg.seed)

    # set dataset, dataloader
    train = pd.read_csv(cfg.train_df)

    if cfg.fold == -1:
        val_df = train[train["fold"] == 0]
    else:
        val_df = train[train["fold"] == cfg.fold]
    train_df = train[train["fold"] != cfg.fold]

    train_dataset = get_train_dataset(train_df, cfg)
    val_dataset = get_val_dataset(val_df, cfg)
    train_dataloader = get_train_dataloader(train_dataset, cfg)
    val_dataloader = get_val_dataloader(val_dataset, cfg)
    if cfg.train_val is True:
        train_val_dataset = get_val_dataset(train_df, cfg)
        train_val_dataloader = get_val_dataloader(train_val_dataset, cfg)

    to_device_transform = ToDeviced(
        keys=("input", "target", "mask", "is_annotated"), device=cfg.device)
    cfg.to_device_transform = to_device_transform

    # set model
    model = RanzcrNet(cfg)
    model.to(cfg.device)

    # set optimizer, lr scheduler
    total_steps = len(train_dataset)
    optimizer = get_optimizer(model, cfg)
    scheduler = get_scheduler(cfg, optimizer, total_steps)

    # set other tools
    if cfg.mixed_precision:
        scaler = GradScaler()
    else:
        scaler = None
    writer = SummaryWriter(str(cfg.output_dir + f"/fold{cfg.fold}/"))

    # train and val loop
    step = 0
    i = 0
    best_val_loss = np.inf
    optimizer.zero_grad()

    for epoch in range(cfg.epochs):
        print("EPOCH:", epoch)
        gc.collect()

        if cfg.train is True:
            run_train(
                model=model,
                train_dataloader=train_dataloader,
                optimizer=optimizer,
                scheduler=scheduler,
                cfg=cfg,
                scaler=scaler,
                writer=writer,
                epoch=epoch,
                iteration=i,
                step=step,
            )

        if (epoch + 1) % cfg.eval_epochs == 0 or (epoch + 1) == cfg.epochs:
            val_loss = run_eval(
                model=model,
                val_dataloader=val_dataloader,
                cfg=cfg,
                writer=writer,
                epoch=epoch,
            )

            if cfg.train_val is True:
                if (epoch + 1) % cfg.eval_train_epochs == 0 or (epoch + 1) == cfg.epochs:
                    train_val_loss = run_eval(model, train_val_dataloader, cfg,
                                              writer, epoch)
                    print(f"train_val_loss {train_val_loss:.5}")

            if val_loss < best_val_loss:
                print(f"SAVING CHECKPOINT: val_loss {best_val_loss:.5} -> {val_loss:.5}")
                best_val_loss = val_loss

                checkpoint = create_checkpoint(
                    model,
                    optimizer,
                    epoch,
                    scheduler=scheduler,
                    scaler=scaler,
                )
                torch.save(
                    checkpoint,
                    f"{cfg.output_dir}/fold{cfg.fold}/checkpoint_best_seed{cfg.seed}.pth",
                )
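# In this script `create_checkpoint` only builds the state dict; the caller decides
# where to save it with torch.save. A minimal sketch consistent with the call site
# (the dictionary key names are assumptions):
def create_checkpoint(model, optimizer, epoch, scheduler=None, scaler=None):
    """Bundle everything needed to resume: model, optimizer, epoch, and optional
    scheduler / AMP scaler state."""
    checkpoint = {
        "model": model.state_dict(),
        "optimizer": optimizer.state_dict(),
        "epoch": epoch,
    }
    if scheduler is not None:
        checkpoint["scheduler"] = scheduler.state_dict()
    if scaler is not None:
        checkpoint["scaler"] = scaler.state_dict()
    return checkpoint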