Example #1
from data import read_csv, build_sentences


def get_shallow_stats(x_path):
    """Count sentences, tokens, and distinct words in a token CSV."""
    X = read_csv(x_path)
    sentences = build_sentences(X, k=1)

    # Single pass over every token: running count plus distinct-word set.
    word_count = 0
    vocab = set()
    for sentence in sentences:
        for word in sentence:
            word_count += 1
            vocab.add(word)

    return len(sentences), word_count, len(vocab)
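
A quick usage sketch for the function above (the path is illustrative; any tokenised CSV accepted by read_csv would do):

# Hypothetical call, reusing the training file path from Example #4.
n_sentences, n_tokens, vocab_size = get_shallow_stats("./data/train_x.csv")
print(f"{n_sentences} sentences, {n_tokens} tokens, {vocab_size} distinct words")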
Example #2
from data import (
    read_csv,
    build_sentences,
    build_sentences_labels,
    handle_uncommon_words,
    handle_unknown_sentences,
)
from probs import build_emission_map, build_transition_map


if __name__ == '__main__':
    X_train = read_csv("./data/dev_x.csv")
    X_labels = read_csv("./data/dev_y.csv")

    sentences, labels = build_sentences_labels(X_train, X_labels, k=2)
    sentences = handle_uncommon_words(sentences)

    #transition_map = build_transition_map(labels)
    #emission_map = build_emission_map(sentences, labels)
    # Vocabulary of the training sentences after handle_uncommon_words.
    vocab = set()
    for sentence in sentences:
        for word in sentence:
            vocab.add(word)

    # Collect any test word that still falls outside the training vocabulary.
    not_found = []
    test_sentences = build_sentences(read_csv('./data/test_x.csv'), k=2)
    test_sentences = handle_unknown_sentences(test_sentences, vocab)
    for test_sentence in test_sentences:
        for test_word in test_sentence:
            if test_word not in vocab:
                not_found.append(test_word)

    with open('output.txt', 'w') as f:
        for word in not_found:
            f.write(f"{word}\n")
Example #3

# Reconstructed context: the listing resumes mid-way through a training
# loop, so the imports and the train_epoch call below are assumptions;
# only the fragment's own lines are original.
import time

import torch
import torch.optim as optim

import config
import data
from transformer.Models import Transformer
from transformer.Optim import ScheduledOptim


def train(model, training_data, validation_data, optimizer, predicates):
    valid_accus = []
    for epoch_i in range(config.epoch):
        start = time.time()
        train_loss, train_accu = train_epoch(model, training_data,
                                             optimizer, predicates)
        print('  - (Training)   accuracy: {accu:3.3f} %, '
              'elapse: {elapse:3.3f} min'.format(
                  accu=100 * train_accu,
                  elapse=(time.time() - start) / 60))

        start = time.time()
        valid_loss, valid_accu = eval_epoch(model, validation_data, predicates)
        print('  - (Validation) accuracy: {accu:3.3f} %, '
              'elapse: {elapse:3.3f} min'.format(
                  accu=100 * valid_accu,
                  elapse=(time.time() - start) / 60))

        valid_accus += [valid_accu]

device = torch.device('cpu')


word2idx, ints, en1_pos, en2_pos, predicates, relation2idx = data.build_sentences()

training_data, validation_data = prepare_dataloaders(
    word2idx, ints, en1_pos, en2_pos, predicates)
model = Transformer(
    n_src_vocab=len(word2idx),
    len_max_seq=config.max_seq_len).to(device)

optimizer = ScheduledOptim(
    optim.Adam(
        filter(lambda x: x.requires_grad, model.parameters()),
        betas=(0.9, 0.98), eps=1e-09),
    512, 1000)

train(model, training_data, validation_data, optimizer, predicates)
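
ScheduledOptim is constructed above with d_model = 512 and 1000 warm-up steps. In the reference "Attention Is All You Need" setup that corresponds to the schedule lr = d_model^-0.5 * min(step^-0.5, step * warmup^-1.5); a sketch of that rule, assuming the unseen ScheduledOptim follows it:

# Assumed learning-rate rule (Vaswani et al., 2017); the ScheduledOptim
# class itself is not shown in this listing.
def lr_at_step(step, d_model=512, n_warmup_steps=1000):
    return d_model ** -0.5 * min(step ** -0.5, step * n_warmup_steps ** -1.5)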
Example #4
# Fragment: tail of a Viterbi decoder. Follow the backpointers to recover
# the best state sequence for one sentence, then append it to `final`.
        for mp_idx in range(1, len(backpointers)):
            result.append(prev)
            mp = backpointers[mp_idx][prev]
            #print(f"{prev} -> {mp}")
            prev = mp
        result.reverse()
        final = final + result
    return final
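
As a self-contained illustration of the backtrace the fragment performs, here is the standard backward walk on toy data (states and pointers are made up, and the fragment's own storage layout may differ):

# Toy example: backpointers[t][state] = best predecessor of `state` at time t.
backpointers = [{}, {"N": "D", "V": "N"}, {"N": "V", "V": "N"}]
path = ["V"]                      # argmax state at the final time step
for t in range(len(backpointers) - 1, 0, -1):
    path.append(backpointers[t][path[-1]])
path.reverse()
print(path)                       # ['D', 'N', 'V']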


X_train = read_csv("./data/train_x.csv")
Y_train = read_csv("./data/train_y.csv")
sentences, labels = build_sentences_labels(X_train, Y_train, k=1)
# sentences = handle_uncommon_words(sentences, threshold=3)
vocab = get_vocab_sentences(sentences)

# HMM parameters estimated from the training set.
emission_map = build_emission_map(sentences, labels)
transition_map = build_transition_map(labels)

X_test = read_csv("./data/dev_x.csv")
y_dev = read_csv("./data/dev_y.csv")
sentences_test = build_sentences(X_test, k=1)
# sentences_test = handle_unknown_sentences(sentences_test, vocab)

suffix_dict = generate_suffix_dict(X_train, Y_train)

# Decode the dev sentences and score against the gold labels.
pred_dev = viterbi(sentences_test, emission_map, transition_map, suffix_dict)
print('Accuracy:', compAccu(X_test, y_dev, pred_dev, vocab))

print(emission("'s", emission_map))
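
The final print looks up the emission scores for the token 's. A minimal sketch of such a lookup, assuming emission_map is a nested tag -> word -> probability dictionary (the real emission and build_emission_map in probs.py are not shown):

# Assumed shape: emission_map[tag][word] = P(word | tag).
def emission_sketch(word, emission_map):
    return {tag: word_probs.get(word, 0.0)
            for tag, word_probs in emission_map.items()}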