Example #1
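# Inference script: loads trained BiLSTM segmentor-classifier checkpoints and, for each
# division mode (essay / paragraph / sentence), predicts per-token AC tags on the test
# split and writes them next to the true and post-processed tags.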
def main(mode, config_file_path, trained_model_path):
    # load hyper-parameters
    h_params = HyperParams(config_file_path)
    from src.utils.preprocess import get_train_test_split
    from src.utils.preprocess import prepare_data

    torch.manual_seed(h_params.rand_seed)
    model_dir = h_params.models_dir
    all_models = os.listdir(model_dir)
    only_best = [m for m in all_models
                 if m.find("2018-12-15") != -1]  # TODO: remove this hard-coded date filter
    for model_name in only_best:
        h_params.use_pos = "no_POS" not in model_name
        for mode in [
                DivisionResolution.ESSAY, DivisionResolution.SENTENCE,
                DivisionResolution.PARAGRAPH
        ]:
            _, test_files = get_train_test_split(
                os.path.abspath(
                    os.path.join(h_params.data_dir, "train-test-split.csv")))
            test_data, ept_offsets = prepare_data(mode, test_files,
                                                  h_params.data_dir)
            device = torch.device(
                "cuda" if torch.cuda.is_available() else "cpu")
            SegmentorClassifier = (BiLSTM_Segmentor_Classifier if h_params.use_pos
                                   else BiLSTM_Segmentor_Classifier_no_pos)
            model = SegmentorClassifier(
                h_params.d_word_embd, h_params.d_pos_embd, h_params.d_h1,
                h_params.n_lstm_layers, h_params.word_voc_size,
                h_params.pos_voc_size, h_params.ac_tagset_size,
                h_params.batch_size, device,
                h_params.pretraind_embd_layer_path)
            # load trained model state-dict
            checkpoint = torch.load(os.path.join(model_dir, model_name))
            model.load_state_dict(checkpoint['model_state_dict'])
            ## set CUDA if available
            if torch.cuda.is_available():
                model.cuda()
            # set evaluation mode
            model.eval()
            # inference for all chosen data
            preds = []
            with torch.no_grad():
                for (indexed_tokens, indexed_POSs,
                     indexed_AC_tags) in test_data:
                    tag_scores = model((
                        indexed_tokens.to(device),
                        indexed_POSs.to(device)))  # get log soft max for input
                    preds.append(torch.argmax(tag_scores, dim=1).tolist())
            # post-process the predicted tag sequences
            ac_tag2ix = pickle.load(
                open(os.path.join(h_params.vocab_dir, "ac_tag2ix.pcl"), 'rb'))
            corrected_tags = post_process(preds, ac_tag2ix)
            # save results
            results_file = os.path.join(
                h_params.exps_dir,
                "{}|{}.results".format(model_name[:-3], mode))

            true_tags = [ac_tags.tolist() for _, _, ac_tags in test_data]
            with open(results_file, 'wt') as f:
                # write header for file
                f.write("\t".join(("# essay_paragraph_token_index",
                                   "true AC-tag", "predicted AC-tag",
                                   "post processed AC tag")) + '\n')
                # iterate over results (by appropriate division)
                for i_seq in range(len(preds)):
                    for i_tok in range(len(preds[i_seq])):
                        e_p_t_index = ept_offsets[i_seq][i_tok]
                        true_tag = true_tags[i_seq][i_tok]
                        predicted_ac_tag = preds[i_seq][i_tok]
                        post_processed_tag = corrected_tags[i_seq][i_tok]
                        f.write("\t".join(
                            (str(e_p_t_index), str(true_tag),
                             str(predicted_ac_tag), str(post_processed_tag))))
                        f.write('\n')

            sys.stdout.write("finished predictions and saved to {}".format(
                os.path.abspath(results_file)))
Example #2
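# Training script for the BiLSTM segmentor-classifier: negative log-likelihood loss with
# Adam, gradient clipping, and periodic checkpointing of the model and optimizer state.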
def main(mode, config_file_path):
    # train the segmentor-classifier first
    h_params = HyperParams(config_file_path)
    from src.utils.preprocess import get_train_test_split
    from src.utils.preprocess import prepare_data

    torch.manual_seed(h_params.rand_seed)

    training_files, _ = get_train_test_split(
        os.path.abspath(os.path.join("..", "data", "train-test-split.csv")))
    training_data, _ = prepare_data(mode,
                                    training_files,
                                    data_path=h_params.data_dir)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    SegmentorClassifier = (BiLSTM_Segmentor_Classifier if h_params.use_pos
                           else BiLSTM_Segmentor_Classifier_no_pos)
    model = SegmentorClassifier(h_params.d_word_embd, h_params.d_pos_embd,
                                h_params.d_h1, h_params.n_lstm_layers,
                                h_params.word_voc_size, h_params.pos_voc_size,
                                h_params.ac_tagset_size, h_params.batch_size,
                                device, h_params.pretraind_embd_layer_path)

    # set loss function and optimizer (negative log-likelihood loss with the Adam optimizer)
    loss_function = nn.NLLLoss()
    optimizer = optim.Adam(model.parameters(),
                           lr=h_params.learning_rate,
                           weight_decay=h_params.weight_decay)

    ## set CUDA if available
    if torch.cuda.is_available():
        model.cuda()
        loss_function.cuda()

    # display parameters in model
    for param_tensor in model.state_dict():
        print(param_tensor, "\t", model.state_dict()[param_tensor].size())

    # display optimizer's parameters
    print("Optimizer's state_dict:")
    for var_name in optimizer.state_dict():
        print(var_name, "\t", optimizer.state_dict()[var_name])

    # set train mode
    model.train()
    for epoch in range(h_params.n_epochs):
        start_time = time.time()
        acc_loss = 0.0  # accumulated loss per epoch, for display
        for (indexed_tokens, indexed_POSs,
             indexed_AC_tags) in tqdm(training_data):
            # reset accumulated gradients and the LSTM's hidden state between iterations
            model.zero_grad()
            model.hidden1 = model.init_hidden(model.h1dimension)

            # make a forward pass
            tag_scores = model(
                (indexed_tokens.to(device), indexed_POSs.to(device)))

            # backprop
            loss = loss_function(tag_scores, indexed_AC_tags.to(device))
            acc_loss += loss.item()
            loss.backward()

            # gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           h_params.clip_threshold)

            # call optimizer step
            optimizer.step()
        end_time = time.time()
        # output stats
        sys.stdout.write(
            "===> Epoch[{}/{}]: Loss: {:.4f} , time = {:d}[s]\n".format(
                epoch + 1, h_params.n_epochs, acc_loss,
                int(end_time - start_time)))

        if epoch in [25, 50, 75]:
            try:
                torch.save(
                    {
                        'epoch': epoch,
                        'model_state_dict': model.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'loss': loss
                    },
                    os.path.abspath(
                        os.path.join(
                            h_params.models_dir,
                            "{}_SegClass_mode-{}_ep-{}_{}.pt".format(
                                str(date.today()), mode, epoch,
                                "no_POS" if not h_params.use_pos else ""))))
            except Exception:
                sys.stdout.write(
                    'failed to save model in epoch {}\n'.format(epoch))

    # save model
    torch.save(
        {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss
        },
        os.path.abspath(
            os.path.join(
                h_params.models_dir, "{}_SegClass_{}_ep-{}_{}.pt".format(
                    str(date.today()), mode, epoch,
                    "no_POS" if not h_params.use_pos else ""))))

    # announce end
    sys.stdout.write("finished training\n")
Example #3
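# Training script for the argument-relation classifier, optionally initialized from a
# pretrained segmentor-classifier (transfer learning); trains pairwise over the argument
# components of each essay.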
def main(config_file_path, baseline):
    # manual random seed
    h_params: HyperParams = HyperParams(config_file_path)
    torch.manual_seed(h_params.rand_seed)

    training_files, _ = get_train_test_split(
        os.path.abspath(os.path.join("..", "data", "train-test-split.csv")))
    training_data = prepare_relations_data(
        training_files, h_params.data_dir, h_params.vocab_dir, save=False
    )  # list of (ac_dict, ac_id_pairs, relation_tags) tuples, one per essay

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # choose the constructed-features-aided model or the bland LSTM - essentially two
    # stacked LSTM layers followed by two linear layers with ReLU activations
    if baseline:
        RelationClassifier = (BaselineConstructedRelationClassifier
                              if h_params.d_distance_embd != 0
                              else BaselineRelationClassifier)
    else:
        RelationClassifier = (BiLSTMRelationClassifier
                              if h_params.d_distance_embd != 0
                              else BlandRelationClassifier)

    model = RelationClassifier(h_params.d_word_embd, h_params.d_pos_embd,
                               h_params.d_h1, h_params.n_lstm_layers,
                               h_params.word_voc_size, h_params.pos_voc_size,
                               h_params.ac_tagset_size, h_params.batch_size,
                               device, h_params.pretraind_embd_layer_path,
                               h_params.rel_tagset_size, h_params.d_tag_embd,
                               h_params.d_small_embd, h_params.d_distance_embd,
                               h_params.d_h2, h_params.d_h3)

    model.to(device)
    # optionally load weights from a previously trained segmentor model for transfer learning
    if h_params.pretrained_segmentor_path:
        checkpoint = torch.load(h_params.pretrained_segmentor_path)
        pre_trained_state_dict = dict(checkpoint['model_state_dict'])
        model_dict = dict(model.state_dict())
        # filter unused keys
        pre_trained_state_dict = {
            param: value
            for param, value in pre_trained_state_dict.items()
            if param in model_dict
        }
        # overwrite matching parameters with the pretrained values
        for param, value in pre_trained_state_dict.items():
            model_dict[param] = value
        # update state dict in the model
        model.load_state_dict(model_dict)

    # set loss function and optimizer (negative log-likelihood loss with the Adam optimizer)
    loss_function = nn.NLLLoss()
    optimizer = optim.Adam(model.parameters(),
                           lr=h_params.learning_rate,
                           weight_decay=h_params.weight_decay)

    ## set CUDA if available
    if torch.cuda.is_available():
        model.cuda()
        loss_function.cuda()

    # display parameters in model
    for param_tensor in model.state_dict():
        print(param_tensor, "\t", model.state_dict()[param_tensor].size())

    # display optimizer's parameters
    print("Optimizer's state_dict:")
    for var_name in optimizer.state_dict():
        print(var_name, "\t", optimizer.state_dict()[var_name])

    # set train mode
    model.train()
    save_name = "{}{}".format(
        model.__class__.__name__,
        "_transfer" if h_params.pretrained_segmentor_path else "")

    log = open("/home/yochay/ukp_argmining_rnn/logs/" + save_name + ".log",
               'wt')
    for epoch in range(h_params.n_epochs):
        start_time = time.time()
        acc_loss = 0.0  # accumulated loss per epoch, for display
        for (ac_dict, ac_pairs, rel_tags) in tqdm(training_data):
            for i_rel in range(len(ac_pairs)):
                a_id, b_id = ac_pairs[i_rel][0], ac_pairs[i_rel][1]
                try:
                    ac_a, ac_b = ac_dict[a_id], ac_dict[b_id]

                    # reset accumulated gradients and the LSTMs' hidden states between iterations
                    model.zero_grad()
                    model.hidden1 = model.init_hidden(model.h1dimension)
                    if not baseline:
                        model.hidden2 = model.init_hidden(model.h2dimension)

                    # make a forward pass
                    tag_scores = model((ac_a, ac_b))

                    # backprop
                    loss = loss_function(tag_scores,
                                         rel_tags[i_rel].view(1).to(device))
                    acc_loss += loss.item()
                    loss.backward()

                    # gradient clipping
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   h_params.clip_threshold)

                    # call optimizer step
                    optimizer.step()

                except KeyError:
                    # the pair references an AC id missing from ac_dict (badly preprocessed essay)
                    try:
                        log.write(
                            "essay {}\tids({},{}) - bad preprocess\n".format(
                                ac_dict[0].essay, a_id, b_id))
                    except Exception:
                        pass
        end_time = time.time()
        # output stats
        sys.stdout.write(
            "===> Epoch[{}/{}]: Loss: {:.4f} , time = {:d}[s]\n".format(
                epoch + 1, h_params.n_epochs, acc_loss,
                int(end_time - start_time)))

        if (epoch + 1) % 25 == 0 or epoch in [2, 4, 9]:
            try:
                torch.save(
                    {
                        'epoch': epoch + 1,
                        'model_state_dict': model.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'loss': loss
                    },
                    os.path.abspath(
                        os.path.join(
                            h_params.models_dir,
                            "{}_ep-{}.pt".format(save_name, epoch + 1))))
            except Exception:
                sys.stdout.write(
                    'failed to save model in epoch {}\n'.format(epoch + 1))
    log.close()

    # save model
    torch.save(
        {
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss
        },
        os.path.abspath(
            os.path.join(h_params.models_dir,
                         "{}_ep-{}.pt".format(save_name, epoch + 1))))

    # announce end
    sys.stdout.write("finished training")
Example #4
# check how many paragraphs, sentences and essays are in the train and test splits
import os
import sys

from src.utils.preprocess import get_train_test_split

data_path = os.path.join("..", "data")
train_files, test_files = get_train_test_split(
    os.path.join(data_path, 'train-test-split.csv'))
for ds in (train_files, test_files):
    n_tokens = 0
    n_sents = 0
    n_paragraphs = 0
    n_essays = 0
    len_para = 0
    for essay in ds:
        n_essays += 1
        i_line = 0
        with open(os.path.join(data_path, "processed", essay + ".tsv")) as f:
            for line in f:
                i_line += 1
                if line[:3] == "# p":
                    if len_para == 0:
                        print("ess:{}\tline:{}".format(essay, i_line))
                    n_paragraphs += 1
                    len_para = 0
                elif line[:3] == "# s":
                    n_sents += 1
                else:
                    n_tokens += 1
                    len_para += 1
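    # report the collected counts for this split
    print("essays: {}\tparagraphs: {}\tsentences: {}\ttokens: {}".format(
        n_essays, n_paragraphs, n_sents, n_tokens))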
Example #5
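# Inference script for the relation classifier: picks the model class from the checkpoint
# filename, predicts a relation tag for each AC pair in the test split and writes the
# predictions alongside the true tags.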
def main(config_file_path, trained_model_path, use_gold_segmentation):
    # get hyper parameters
    h_params = HyperParams(config_file_path)

    torch.manual_seed(h_params.rand_seed)
    _, test_files = get_train_test_split(
        os.path.abspath(os.path.join("..", "data", "train-test-split.csv")))
    test_data = prepare_relations_data(files=test_files,
                                       data_dir=os.path.join(
                                           h_params.exps_dir, "best_results"),
                                       vocab_dir=h_params.vocab_dir,
                                       save=True)
    gold_data = prepare_relations_data(files=test_files,
                                       data_dir=os.path.join(
                                           h_params.data_dir, "processed"),
                                       vocab_dir=h_params.vocab_dir,
                                       save=True)
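    # test_data is built from the segmentation results under exps_dir/best_results;
    # gold_data from the gold annotations under data_dir/processed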

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if "BiLSTMRelationClassifier_transfer" in trained_model_path:
        RelationsClassifier = BiLSTMRelationClassifier
    elif "BlandRelationClassifier" in trained_model_path:
        RelationsClassifier = BlandRelationClassifier
    elif "BaselineConstructedRelationClassifier" in trained_model_path:
        RelationsClassifier = BaselineConstructedRelationClassifier
    else:
        RelationsClassifier = BaselineRelationClassifier

    model = RelationsClassifier(
        h_params.d_word_embd, h_params.d_pos_embd, h_params.d_h1,
        h_params.n_lstm_layers, h_params.word_voc_size, h_params.pos_voc_size,
        h_params.ac_tagset_size, h_params.batch_size, device,
        h_params.pretraind_embd_layer_path, h_params.rel_tagset_size,
        h_params.d_tag_embd, h_params.d_small_embd, h_params.d_distance_embd,
        h_params.d_h2, h_params.d_h3)

    # load trained model state-dict
    checkpoint = torch.load(trained_model_path)
    model.load_state_dict(checkpoint['model_state_dict'])

    ## set CUDA if available
    if torch.cuda.is_available():
        model.cuda()

    # set evaluation mode
    model.eval()

    # inference for all chosen data
    preds = []
    essay_pairs_types = []
    true_rel_tag = []
    with torch.no_grad():
        for (ac_dict, ac_pairs, rel_tags) in test_data:
            # iterate over all AC pairs of the essay (as in training)
            for i_rel in range(len(ac_pairs)):
                try:
                    a_id, b_id = ac_pairs[i_rel][0], ac_pairs[i_rel][1]
                    ac_a, ac_b = ac_dict[a_id], ac_dict[b_id]
                    tag_scores = model((ac_a, ac_b))  # log-softmax scores for the pair
                    preds.append(torch.argmax(tag_scores, dim=1).tolist())
                    essay_pairs_types.append(
                        (ac_a.essay, ac_pairs[i_rel], ac_a.type, ac_b.type))
                    true_rel_tag.append(rel_tags[i_rel])
                except Exception:
                    pass  # skip badly preprocessed pairs

    # save results
    results_file = os.path.join(
        h_params.exps_dir,
        os.path.split(trained_model_path)[-1][:-3] + ".results")

    with open(results_file, 'wt') as f:
        # write header for file
        f.write("\t".join(("#essay", "ac_id_pairs", "ac_a type", "ac_b type",
                           "prediction", "y_true")) + '\n')
        # iterate over results (by appropriate division)
        for i_pred in range(len(preds)):
            essay, pair, a_type, b_type = essay_pairs_types[i_pred]
            pred = preds[i_pred]
            true = true_rel_tag[i_pred]
            f.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(essay, pair, a_type,
                                                      b_type, pred, true))

    sys.stdout.write("finished predictions and saved to {}".format(
        os.path.abspath(results_file)))