Example #1
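Both examples assume the standard-library and PyTorch imports sketched below; the remaining names (get_glove, load_complete_data, load_split_eighty_twenty, generateKFoldDatasets, SequencePairDataset, EncoderDecoder, train, test) are project-level helpers from the surrounding repository and are not shown here.

import datetime
import math
import os

import torch
from torch.utils.data import DataLoader
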
def main(model_name,
         use_cuda,
         batch_size,
         teacher_forcing_schedule,
         keep_prob,
         val_size,
         lr,
         decoder_type,
         vocab_limit,
         hidden_size,
         embedding_size,
         max_length,
         train_data,
         test_data,
         device,
         seed=42):

    # TODO: Change logging to reflect loaded parameters
    model_path = './model/' + model_name
    print("training %s with use_cuda=%s, batch_size=%i" %
          (model_name, use_cuda, batch_size),
          flush=True)
    print("teacher_forcing_schedule=", teacher_forcing_schedule, flush=True)
    print(
        "keep_prob=%f, val_size=%f, lr=%f, decoder_type=%s, vocab_limit=%i, hidden_size=%i, embedding_size=%i, max_length=%i, seed=%i"
        % (keep_prob, val_size, lr, decoder_type, vocab_limit, hidden_size,
           embedding_size, max_length, seed),
        flush=True)

    glove = get_glove()

    currentDT = datetime.datetime.now()
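    # One fixed seed per repetition; each is passed to generateKFoldDatasets below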
    seeds = [8, 23, 10, 41, 32]

    all_means_seen = []
    all_means_unseen = []
    all_means_mixed = []

    print('Testing using ' + test_data)

    mixed_src, mixed_tgt = load_complete_data('twophrase_1seen1unseen_clean')
    test_src, test_tgt = load_complete_data(test_data)

    # Repeat once per listed seed #
    for it in range(len(seeds)):
        print("creating training, and validation datasets", flush=True)
        all_datasets = generateKFoldDatasets(
            train_data,
            vocab_limit=vocab_limit,
            use_extended_vocab=(decoder_type == 'copy'),
            seed=seeds[it])
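        # all_datasets holds one (train_dataset, val_dataset) pair per fold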
        all_accuracies_seen = []
        all_accuracies_unseen = []
        all_accuracies_mixed = []

        # Repeat for all k dataset folds #
        for k in range(len(all_datasets)):
            train_dataset = all_datasets[k][0]
            val_dataset = all_datasets[k][1]

            print("creating {}th encoder-decoder model".format(k), flush=True)
            encoder_decoder = EncoderDecoder(train_dataset.lang, max_length,
                                             hidden_size, embedding_size,
                                             decoder_type, device, glove)

            test_dataset = SequencePairDataset(
                test_src,
                test_tgt,
                lang=train_dataset.lang,
                use_extended_vocab=(encoder_decoder.decoder_type == 'copy'))

            mixed_dataset = SequencePairDataset(
                mixed_src,
                mixed_tgt,
                lang=train_dataset.lang,
                use_extended_vocab=(encoder_decoder.decoder_type == 'copy'))

            encoder_decoder = encoder_decoder.to(device)

            train_data_loader = DataLoader(train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True,
                                           num_workers=12)
            val_data_loader = DataLoader(val_dataset, batch_size=batch_size)
            mixed_data_loader = DataLoader(mixed_dataset,
                                           batch_size=batch_size)
            test_data_loader = DataLoader(test_dataset, batch_size=batch_size)

            # Make a directory to hold this fold's model
            os.makedirs(model_path + str(k) + str(it) + '/', exist_ok=True)
            # Train the model
            train(encoder_decoder, train_data_loader,
                  model_name + str(k) + str(it), val_data_loader, keep_prob,
                  teacher_forcing_schedule, lr,
                  encoder_decoder.decoder.max_length, device, test_data_loader)
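            # train() is assumed to write checkpoints, including a '*_final.pt', into this fold's model directory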

            # Point model_path at this fold's directory so the right checkpoint loads
            model_path = './model/' + model_name + str(k) + str(it) + '/'
            # Reload the final checkpoint saved by train() before testing
            trained_model = torch.load(model_path + model_name +
                                       '{}{}_final.pt'.format(k, it),
                                       map_location=device)

            ## Write per-split logs for this fold ##
            # NOTE: currentDT is fixed before the loops, so every fold and
            # repetition writes to (and overwrites) the same log file names.
            stamp = currentDT.strftime("%Y%m%d%H%M%S")

            with open("./logs/log_" + model_name + "seen" + stamp + ".txt",
                      "w") as s_f, \
                 open("./logs/log_" + model_name + "seencorrect" + stamp +
                      ".txt", "w") as sc_f:
                s_f.write("TRAINING MODEL {}\nUSING SEED VALUE {}\n\n".format(
                    model_name, seeds[it]))
                with torch.no_grad():
                    seen_accuracy = test(trained_model,
                                         val_data_loader,
                                         encoder_decoder.decoder.max_length,
                                         device,
                                         log_files=(s_f, sc_f))

            with open("./logs/log_" + model_name + "1seen1unseen" + stamp +
                      ".txt", "w") as m_f, \
                 open("./logs/log_" + model_name + "1seen1unseencorrect" +
                      stamp + ".txt", "w") as mc_f:
                m_f.write("TRAINING MODEL {}\nUSING SEED VALUE {}\n\n".format(
                    model_name, seeds[it]))
                with torch.no_grad():
                    mixed_accuracy = test(trained_model,
                                          mixed_data_loader,
                                          encoder_decoder.decoder.max_length,
                                          device,
                                          log_files=(m_f, mc_f))

            with open("./logs/log_" + model_name + "unseen" + stamp + ".txt",
                      "w") as u_f, \
                 open("./logs/log_" + model_name + "unseencorrect" + stamp +
                      ".txt", "w") as uc_f:
                u_f.write("TRAINING MODEL {}\nUSING SEED VALUE {}\n\n".format(
                    model_name, seeds[it]))
                with torch.no_grad():
                    unseen_accuracy = test(trained_model,
                                           test_data_loader,
                                           encoder_decoder.decoder.max_length,
                                           device,
                                           log_files=(u_f, uc_f))

            # Append the accuracies for this model so they can be averaged
            all_accuracies_seen.append(seen_accuracy)
            all_accuracies_unseen.append(unseen_accuracy)
            all_accuracies_mixed.append(mixed_accuracy)

            # Reset model path
            model_path = './model/' + model_name

        # Average the fold accuracies for this repetition and record the means
        s_mean = sum(all_accuracies_seen) / len(all_accuracies_seen)
        m_mean = sum(all_accuracies_mixed) / len(all_accuracies_mixed)
        u_mean = sum(all_accuracies_unseen) / len(all_accuracies_unseen)
        all_means_seen.append(s_mean)
        all_means_mixed.append(m_mean)
        all_means_unseen.append(u_mean)

    # WRITE FINAL CROSS-VALIDATION RESULTS #
    currentDT = datetime.datetime.now()
    acc_f = open(
        "./results/results_" + model_name +
        currentDT.strftime("%Y%m%d%H%M%S") + ".txt", "w")

    # Seen accuracies
    acc_f.write("SEEN ACCURACIES:\n")
    for acc in all_means_seen:
        acc_f.write("{}\n".format(acc))

    # Mean and population standard deviation across the repetition means
    s_mean = sum(all_means_seen) / len(all_means_seen)
    s_std_dev = math.sqrt(
        sum((x - s_mean) ** 2 for x in all_means_seen) / len(all_means_seen))
    acc_f.write("\nMean: {}\nStandard Deviation: {}\n".format(
        s_mean, s_std_dev))

    # One seen one unseen accuracies
    acc_f.write("ONE SEEN ONE UNSEEN ACCURACIES:\n")
    for acc in all_means_mixed:
        acc_f.write("{}\n".format(acc))

    m_mean = sum(all_means_mixed) / len(all_means_mixed)
    m_std_dev = math.sqrt(
        sum((x - m_mean) ** 2 for x in all_means_mixed) / len(all_means_mixed))
    acc_f.write("\nMean: {}\nStandard Deviation: {}\n".format(
        m_mean, m_std_dev))

    # Unseen accuracies
    acc_f.write("\nUNSEEN ACCURACIES:\n")
    for acc in all_means_unseen:
        acc_f.write("{}\n".format(acc))

    u_mean = sum(all_means_unseen) / len(all_means_unseen)
    u_std_dev = math.sqrt(
        sum((x - u_mean) ** 2 for x in all_means_unseen) / len(all_means_unseen))
    acc_f.write("\nMean: {}\nStandard Deviation: {}\n".format(
        u_mean, u_std_dev))
    acc_f.close()
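
The three mean/standard-deviation blocks above repeat the same arithmetic by hand. As a minimal alternative sketch, Python's statistics module produces the same numbers (pstdev is the population standard deviation, matching the divide-by-len formula used above; the helper name summarize is hypothetical):

import statistics

def summarize(label, means, out_file):
    # Write each repetition's mean, then the overall mean and the
    # population standard deviation across those means
    out_file.write("{}:\n".format(label))
    for acc in means:
        out_file.write("{}\n".format(acc))
    out_file.write("\nMean: {}\nStandard Deviation: {}\n".format(
        statistics.mean(means), statistics.pstdev(means)))

With that helper, each block collapses to a call like summarize("SEEN ACCURACIES", all_means_seen, acc_f).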
Example #2
def main(model_name,
         use_cuda,
         batch_size,
         teacher_forcing_schedule,
         keep_prob,
         val_size,
         lr,
         decoder_type,
         vocab_limit,
         hidden_size,
         embedding_size,
         max_length,
         main_data,
         test_data,
         device,
         seed=42):
    print("Max Length is: ", max_length)
    model_path = './model/' + model_name + '/'

    print("training %s with use_cuda=%s, batch_size=%i" %
          (model_name, use_cuda, batch_size), flush=True)
    print("teacher_forcing_schedule=", teacher_forcing_schedule, flush=True)
    print("keep_prob=%f, val_size=%f, lr=%f, decoder_type=%s, vocab_limit=%i, hidden_size=%i, embedding_size=%i, max_length=%i, seed=%i" %
          (keep_prob, val_size, lr, decoder_type, vocab_limit, hidden_size,
           embedding_size, max_length, seed), flush=True)

    # load_split_eighty_twenty is assumed to return an 80/20 train/validation split of main_data
    train_src, train_tgt, val_src, val_tgt = load_split_eighty_twenty(main_data, seed)

    if test_data:
        test_src, test_tgt = load_complete_data(test_data)
    
    if os.path.isdir(model_path):

        print("loading encoder and decoder from model_path", flush=True)
        encoder_decoder = torch.load(model_path + model_name + '_final.pt',
                                     map_location=device)

        print("creating training, validation, and testing datasets with saved languages", flush=True)
        train_dataset = SequencePairDataset(train_src, train_tgt,
                                            lang=encoder_decoder.lang,
                                            use_extended_vocab=(encoder_decoder.decoder_type=='copy'))

        val_dataset = SequencePairDataset(val_src, val_tgt,
                                          lang=encoder_decoder.lang,
                                          use_extended_vocab=(encoder_decoder.decoder_type=='copy'))

        if test_data:
            test_dataset = SequencePairDataset(test_src, test_tgt,
                                               lang=train_dataset.lang,
                                               use_extended_vocab=(encoder_decoder.decoder_type=='copy'))

    else:
        os.makedirs(model_path)  # also creates './model/' if it doesn't exist yet

        print("creating training, validation, and testing datasets", flush=True)
        train_dataset = SequencePairDataset(train_src, train_tgt,
                                            vocab_limit=vocab_limit,
                                            use_extended_vocab=(decoder_type=='copy'))

        val_dataset = SequencePairDataset(val_src, val_tgt,
                                          lang=train_dataset.lang,
                                          use_extended_vocab=(decoder_type=='copy'))

        if test_data:
            test_dataset = SequencePairDataset(test_src, test_tgt,
                                               lang=train_dataset.lang,
                                               use_extended_vocab=(decoder_type=='copy'))

        print("creating encoder-decoder model", flush=True)
        encoder_decoder = EncoderDecoder(train_dataset.lang,
                                         max_length,
                                         embedding_size,
                                         hidden_size,
                                         decoder_type,
                                         device)
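        # NOTE: the argument order here (embedding_size before hidden_size) differs
        # from Example #1; it must match the EncoderDecoder constructor in use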

        torch.save(encoder_decoder, model_path + '%s.pt' % model_name)

    encoder_decoder = encoder_decoder.to(device)

    train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=12)
    val_data_loader = DataLoader(val_dataset, batch_size=batch_size)
    test_data_loader = DataLoader(test_dataset, batch_size=batch_size) if test_data else None

    train(encoder_decoder,
          train_data_loader,
          model_name,
          val_data_loader,
          keep_prob,
          teacher_forcing_schedule,
          lr,
          encoder_decoder.decoder.max_length,
          device)

    # Reload the final checkpoint saved by train() so testing uses it,
    # rather than relying on train()'s return value
    trained_model = torch.load(model_path + model_name + '_final.pt',
                               map_location=device)

    # Write final model errors to an output log
    if test_data:
        with open("./logs/log_" + model_name + ".txt", "w") as f:
            f.write("MODEL {}\n\n".format(model_name))
            f.write("UNSEEN ACCURACY\n")
            with torch.no_grad():
                accuracy = test(trained_model, test_data_loader,
                                encoder_decoder.decoder.max_length, device,
                                log_file=f)
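
For context, a minimal sketch of how Example #2's main might be invoked. Every value below is illustrative, the model and dataset names are placeholders, and the device/use_cuda wiring is just the common PyTorch pattern rather than anything these snippets specify:

import torch

use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')

main(model_name='copy_demo',               # hypothetical run name
     use_cuda=use_cuda,
     batch_size=32,
     teacher_forcing_schedule=[0.5] * 50,  # assumed: one forcing ratio per epoch
     keep_prob=1.0,                        # no dropout
     val_size=0.2,
     lr=1e-3,
     decoder_type='copy',                  # triggers the extended (copy) vocabulary
     vocab_limit=5000,
     hidden_size=256,
     embedding_size=300,
     max_length=50,
     main_data='my_dataset_clean',         # placeholder dataset identifiers
     test_data='my_testset_clean',
     device=device)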