Example #1
def build_graph(config):

    word2idx, idx2word = get_vocabs(config['vocab_file'])
    embeddings = get_embeddings(word2idx, config['s2v_file'])

    weights = config.get('weights', [1 for _ in config['metrics']])
    assert len(config['metrics']) == len(weights)
    metrics = {m: {'weight': w} for m, w in zip(config['metrics'], weights)}

    if 'lm' in metrics:
        metrics['lm'].update(
            dict(forward=config['lm_save_dir'],
                 reverse=config.get('lm_rev_save_dir', None),
                 num_words=len(word2idx)))

    if 'cos' in metrics:
        idf_file = config.get('idf_file', None)
        if idf_file is not None:
            metrics['cos'].update(
                dict(idf=get_idf_vector(idf_file, word2idx),
                     embeddings=embeddings))
        else:
            metrics['cos'].update(dict(embeddings=embeddings))

    sess_config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
    sess = tf.Session(config=sess_config)

    model_inputs, model_outputs = get_model(metrics, mode=config['mode'])

    if 'lm' in metrics:
        init_lm_checkpoints(metrics['lm'])
    sess.run(tf.global_variables_initializer())
    return sess, model_inputs, model_outputs, embeddings, word2idx, idx2word
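
# A minimal sketch of calling build_graph. The key names mirror the lookups in
# the function above; the file paths, metric list, and mode value are
# placeholders, not values confirmed by the source.
config = {
    'vocab_file': 'data/vocab.txt',         # hypothetical path
    's2v_file': 'data/s2v_embeddings.txt',  # hypothetical path
    'metrics': ['cos'],                     # adding 'lm' would also require 'lm_save_dir'
    'weights': [1.0],
    'idf_file': None,                       # optional; None falls back to plain cosine weighting
    'mode': 'eval',                         # placeholder mode string
}
sess, model_inputs, model_outputs, embeddings, word2idx, idx2word = build_graph(config)
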
Example #2
def build_data(config):
    """
    Procedure to build data

    Args:
        config: defines attributes needed in the function
    Returns:
        creates vocab files from the datasets
        creates a npz embedding file from trimmed glove vectors
    """
    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev = CoNLLDataset(config.dev_filename, processing_word)
    test = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)
    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)
    vocab = list(vocab)
    vocab.insert(0, PAD)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    # Build and save char vocab
    train = CoNLLDataset(config.train_filename, processing_word)
    vocab_chars = get_char_vocab(train)
    vocab_chars = list(vocab_chars)
    vocab_chars.insert(0, PAD)
    write_vocab(vocab_chars, config.chars_filename)

    # Build and save type vocab
    vocab_types = set()
    print(len(vocab_tags))
    for tag in vocab_tags:
        if tag != 'O':
            vocab_types.add(tag[2:])
    write_vocab(vocab_types, config.types_filename)
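
# A minimal sketch of the config object build_data expects. The attribute names
# are taken from the accesses above; every path and the embedding dimension are
# placeholders.
class Config:
    train_filename = 'data/train.txt'
    dev_filename = 'data/dev.txt'
    test_filename = 'data/test.txt'
    glove_filename = 'data/glove.6B.300d.txt'
    trimmed_filename = 'data/glove.trimmed.npz'
    words_filename = 'data/words.txt'
    tags_filename = 'data/tags.txt'
    chars_filename = 'data/chars.txt'
    types_filename = 'data/types.txt'
    dim = 300

build_data(Config())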
Example #3
def prepare(args, config):
    word2idx, idx2word = get_vocabs(args.vocab_file)
    try:
        embeddings = get_embeddings(word2idx, args.w2v_file)
    except FileNotFoundError:
        logging.info(
            'Embedding file not found; training embeddings from scratch instead')
        embeddings = None
    with tf.variable_scope('LanguageModel'):
        model_inputs, model_outputs = get_model(config, embeddings,
                                                len(word2idx))

    sess_config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
    sess = tf.Session(config=sess_config)
    sess.run(tf.global_variables_initializer())

    return word2idx, model_inputs, model_outputs, sess
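
# Hypothetical invocation of prepare(). The argparse field names follow the
# attribute accesses above; the contents of `config` depend on get_model
# (not shown here), so an empty placeholder is passed.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--vocab_file', default='data/vocab.txt')
parser.add_argument('--w2v_file', default='data/w2v.txt')
args = parser.parse_args()

word2idx, model_inputs, model_outputs, sess = prepare(args, config={})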
Example #4
import numpy as np
import torch

from chu_liu_edmonds import decode_mst
from utils import get_vocabs, nll_loss, UAS
from models import model_1, model_2
from hp import hp_dict

from data_reader import PosDataset

data_dir = "C:\\Users\\jeremy.levy\\OneDrive - Technion\\MSc\\Courses\\courses_gal\\NLP\\HW\\HW2 - wet\\HW2-files\\"
# data_dir = "C:\\Users\\galye\\Dropbox\\studies\\MSc\\NLP\\HW2 - wet\\HW2-files\\"

path_train = data_dir + "train.labeled"
path_test = data_dir + "test.labeled"

word_dict, pos_dict = get_vocabs([path_train, path_test])

dataset_saved = False

if dataset_saved:
    print("Loading dataset")
    training_sentences = torch.load('training_sentences.pt')
    test_sentences = torch.load('test_sentences.pt')
else:
    print("Extracting dataset")
    training_sentences = PosDataset(path_train,
                                    word_dict,
                                    pos_dict,
                                    padding=False)
    test_sentences = PosDataset(path_test, word_dict, pos_dict, padding=False)
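    # Hypothetical follow-up (not in the source): persist the freshly built
    # datasets so the dataset_saved branch above can reload them on the next run.
    torch.save(training_sentences, 'training_sentences.pt')
    torch.save(test_sentences, 'test_sentences.pt')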
Example #5
                torch.tensor(pos_idx_list,
                             dtype=torch.long,
                             requires_grad=False))
            sentence_len_list.append(sentence_len)

        # if padding:
        #     all_sentence_word_idx = torch.tensor(sentence_word_idx_list, dtype=torch.long)
        #     all_sentence_pos_idx = torch.tensor(sentence_pos_idx_list, dtype=torch.long)
        #     all_sentence_len = torch.tensor(sentence_len_list, dtype=torch.long, requires_grad=False)
        #     return TensorDataset(all_sentence_word_idx, all_sentence_pos_idx, all_sentence_len)

        return {
            i: sample_tuple
            for i, sample_tuple in enumerate(
                zip(sentence_word_idx_list, sentence_pos_idx_list,
                    sentence_len_list))
        }


if __name__ == "__main__":
    path_train = "data_new/train.labeled"
    path_test = "data_new/test.labeled"
    paths_list = [path_train, path_test]
    word_dict, pos_dict = get_vocabs(paths_list)
    train = DepDataset(word_dict, pos_dict, 'data_new', 'train', padding=False)
    train_dataloader = DataLoader(train, shuffle=True)
    test = DepDataset(word_dict, pos_dict, 'data_new', 'test', padding=False)
    test_dataloader = DataLoader(test, shuffle=False)
    print("Number of Train Tagged Sentences ", len(train))
    print("Number of Test Tagged Sentences ", len(test))
Example #6
        with torch.no_grad():
            words_idx_tensor, pos_idx_tensor, heads_tensor = input_data
            tag_scores = model(words_idx_tensor, pos_idx_tensor)
            predicted_mst, _ = decode_mst(energy=tag_scores.detach().cpu(),
                                          length=tag_scores.shape[0],
                                          has_labels=False)
            tags.append(predicted_mst[1:])
    return tags


# create data sets
data_dir = "HW2-files/"
path_train = data_dir + "train.labeled"
path_test = data_dir + "test.labeled"
paths_list = [path_train, path_test]
word_cnt, word_dict, pos_dict = utils.get_vocabs(paths_list)
train = utils.PosDataset(word_cnt, word_dict, pos_dict, data_dir, 'train')
train_dataloader = utils.DataLoader(train, shuffle=True)
test = utils.PosDataset(word_cnt, word_dict, pos_dict, data_dir, 'test')
test_dataloader = utils.DataLoader(test, shuffle=False)
word_vocab_size = len(train.word2idx)
tag_vocab_size = len(train.pos_idx_mappings)

use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

# create and load trained model
base_model = basic_model.DnnDependencyParser(basic_model.WORD_EMBEDDING_DIM,
                                             basic_model.POS_EMBEDDING_DIM,
                                             basic_model.HIDDEN_DIM,
                                             word_vocab_size,