Example #1
def build_data(config):
    """
    Procedure to build data

    Args:
        config: defines attributes needed in the function
    Returns:
        creates vocab files from the datasets
        creates an npz embedding file from trimmed GloVe vectors
    """
    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev = CoNLLDataset(config.dev_filename, processing_word)
    #test  = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags, vocab_pos = get_vocabs([train, dev])
    vocab_glove = get_glove_vocab(config.glove_filename)
    vocab_glove_uni = get_glove_vocab(config.glove_uni_filename)

    vocab_feature = get_pos_glove_vocab(config.glove_filename)

    # vocab = vocab_words & vocab_glove
    vocab = vocab_glove | vocab_words
    vocab.add(UNK)
    vocab.add(NUM)

    vocab_pos = vocab_feature
    vocab_pos.add(UNK)
    vocab_pos.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_glove_uni, config.uni_words_filename)
    write_vocab(vocab_tags, config.tags_filename)
    write_vocab(vocab_pos, config.pos_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.t_dim)

    vocab = load_vocab(config.uni_words_filename)

    export_trimmed_uni_vectors(vocab, config.NEdic_filename,
                               config.trimmed_dic, config.dic_dim)

    export_trimmed_uni_vectors(vocab, config.glove_uni_filename,
                               config.uni_trimmed_filename, config.dim)

    vocab_feature = load_vocab(config.pos_filename)
    export_trimmed_pos_vectors(vocab_feature, config.glove_feature,
                               config.feature_trimmed_filename, config.pos_dim)

    # Build and save char vocab
    train = CoNLLDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
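The examples call a handful of vocab helpers that the snippets themselves do not define. Below is a minimal sketch of what get_glove_vocab, write_vocab, and load_vocab might look like; it assumes a plain-text GloVe file (one word followed by its vector per line) and vocab files with one word per line, so the details may differ from the actual data_utils module used by these projects.

def get_glove_vocab(filename):
    """Return the set of words covered by a GloVe text file (first token per line)."""
    vocab = set()
    with open(filename, encoding="utf-8") as f:
        for line in f:
            vocab.add(line.strip().split(' ')[0])
    return vocab


def write_vocab(vocab, filename):
    """Write one word per line; the line number becomes the word's id."""
    with open(filename, "w", encoding="utf-8") as f:
        f.write("\n".join(vocab))


def load_vocab(filename):
    """Load a file written by write_vocab into a dict mapping word -> index."""
    with open(filename, encoding="utf-8") as f:
        return {word.strip(): idx for idx, word in enumerate(f)}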
Example #2
def build_joint_vocab(config):

    # Common options for all datasets
    processing_word = get_processing_word(lowercase=True)
    vocab_glove = get_glove_vocab(config.filename_glove)

    # Compute and save individual vocab
    v1_words, v1_chars = get_conll2005_vocab(config.conll2005, processing_word,
                                             vocab_glove)
    v2_words, v2_chars = get_conll2003_vocab(config.conll2003, processing_word,
                                             vocab_glove)
    v3_words, v3_chars = get_semcor_vocab(config.semcor, processing_word,
                                          vocab_glove)

    print(" *** Joint vocabulary ***")
    vocab_words = v1_words.union(v2_words, v3_words)
    vocab_chars = v1_chars.union(v2_chars, v3_chars)

    # Save combined vocab
    write_vocab(vocab_words, config.filename_words)
    write_vocab(vocab_chars, config.filename_chars)

    # Trim GloVe Vectors
    vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)
Example #3
def build_data(config):
    processing_word = get_processing_word()

    dev = CoNLLDataset(config.dev_filename, processing_word)
    test = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    vocab_words, vocab_tags, vocab_poss = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)
    write_vocab(vocab_poss, config.poss_filename)

    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    train = CoNLLDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
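Every example finishes by trimming the full GloVe file down to the saved vocabulary and storing the result as a compressed .npz file. The following is a minimal sketch of export_trimmed_glove_vectors, under the assumption that load_vocab returns a word-to-index dict and that the array is stored under the key "embeddings"; words without a pretrained vector keep a zero row.

import numpy as np


def export_trimmed_glove_vectors(vocab, glove_filename, trimmed_filename, dim):
    """Keep only the GloVe rows for words in `vocab` and save them as .npz."""
    embeddings = np.zeros([len(vocab), dim])
    with open(glove_filename, encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split(' ')
            word = parts[0]
            if word in vocab:
                embeddings[vocab[word]] = np.asarray(parts[1:], dtype=np.float32)
    np.savez_compressed(trimmed_filename, embeddings=embeddings)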
Example #4
def build_data(config):
    """
    Procedure to build data
    Args:
        config: defines attributes needed in the function
    Returns:
        creates vocab files from the datasets
        creates an npz embedding file from trimmed GloVe vectors
    """
    processing_word = get_processing_word(lowercase=config.lowercase)

    # Generators
    dev = CoNLLDataset(config.dev_filename, processing_word)
    test = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)
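get_vocabs simply iterates over the dataset generators once and collects the word and tag sets (some of the examples use a variant that also returns a POS vocab). A minimal sketch, assuming each dataset yields (words, tags) pairs per sentence:

def get_vocabs(datasets):
    """Collect the sets of words and tags observed in the given datasets."""
    vocab_words, vocab_tags = set(), set()
    for dataset in datasets:
        for words, tags in dataset:
            vocab_words.update(words)
            vocab_tags.update(tags)
    return vocab_words, vocab_tags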
Example #5
def build_data(config):
    """
    Procedure to build data

    Args:
        config: defines attributes needed in the function
    Returns:
        creates vocab files from the datasets
        creates an npz embedding file from trimmed GloVe vectors
    """
    processing_word = get_processing_word(lowercase=True)

    # clean data
    train_filepath, dev_filepath_a = write_clear_data(
        config.train_filename,
        build_dev=config.build_dev_from_trainset,
        dev_ratio=config.dev_ratio)
    test_filepath, dev_filepath_b = write_clear_data(
        config.test_filename,
        build_dev=config.build_dev_from_testset,
        dev_ratio=config.dev_ratio)
    dev_filepath = dev_filepath_a or dev_filepath_b

    # Generators
    dev = Dataset(dev_filepath, processing_word)
    test = Dataset(test_filepath, processing_word)
    train = Dataset(train_filepath, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    # Build and save char vocab
    train = Dataset(train_filepath)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
Example #6
def build_data(config):
    """
    Procedure to build data

    Args:
        config: defines attributes needed in the function
    Returns:
        creates vocab files from the datasets
        creates an npz embedding file from trimmed GloVe vectors
    """
    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev   = CoNLLDataset(config.dev_filename, processing_word)
    test  = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)
    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)
    vocab.add(PAD)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    # Build and save char vocab
    train = CoNLLDataset(config.train_filename, processing_word)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)


    # Build and save type vocab
    vocab_types = set()
    print(len(vocab_tags))
    for tag in vocab_tags:
        if tag != 'O':
            vocab_types.add(tag[2:])
    write_vocab(vocab_types, config.types_filename)
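get_processing_word(lowercase=True) returns a closure that normalizes each raw token before it enters the vocabulary. A minimal sketch, assuming NUM is the placeholder token added to the vocab above (the real helper may also handle character ids and unknown words):

NUM = "$NUM$"  # assumed placeholder for numeric tokens, matching vocab.add(NUM) above


def get_processing_word(lowercase=False):
    """Return a function that normalizes a single raw token."""
    def f(word):
        if lowercase:
            word = word.lower()
        if word.isdigit():
            word = NUM
        return word
    return f

For example, get_processing_word(lowercase=True)("1984") yields "$NUM$", while "Paris" becomes "paris".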
Example #7
def build_data(config):
    """
    Procedure to build data

    Args:
        config: defines attributes needed in the function
    Returns:
        creates vocab files from the datasets
        creates an npz embedding file from trimmed GloVe vectors
    """
    processing_word = get_processing_word(lowercase=config.lowercase)

    # Generators
    dev   = CoNLLDataset(config.dev_filename, processing_word)
    test  = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    # Build and save char vocab
    train = CoNLLDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
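The character vocabulary is built from the raw (non-lowercased) training file. A minimal sketch of get_char_vocab, again assuming the dataset yields (words, tags) pairs:

def get_char_vocab(dataset):
    """Collect the set of characters appearing in the dataset's words."""
    vocab_chars = set()
    for words, _ in dataset:
        for word in words:
            vocab_chars.update(word)
    return vocab_chars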
Example #8
def main():
    # get config
    config = Config(load=False)

    # Generators
    train = get_datasets(config.filename_train)
    valid = get_datasets(config.filename_valid)
    test = get_datasets(config.filename_test)

    # add <start> to glove
    # add_glove(config.filename_glove, config.dim_word)

    # Build word vocab
    train_words = get_train_vocab(train)
    glove_vocab = get_glove_vocab(config.filename_glove)

    # train & glove(word to index)
    vocab = word2index(train_words, glove_vocab)
    # save vocab
    write_vocab(config.filename_words, vocab)

    # index to word
    index = index2word(vocab)
    write_vocab(config.filename_index, index)

    # embedding
    glove_embedding(config.filename_glove, config.filename_trimmed_glove,
                    config.dim_word, vocab, config.start, config.pad)

    # trim datasets
    get_trimmed_datasets(config.filename_trimmed_train, train, vocab,
                         config.max_length)
    get_trimmed_datasets(config.filename_trimmed_valid, valid, vocab,
                         config.max_length)
    get_trimmed_datasets(config.filename_trimmed_test, test, vocab,
                         config.max_length)
Example #9
def build_data(config):
    annotations = []
    meta_filename = 'sw%s%s-ms98-a-trans.text'  # % (file_id, speaker_id)

    for idx in os.listdir(config.wimp_corpus):
        idx_path = os.path.join(config.wimp_corpus, idx)
        if os.path.isfile(idx_path):
            continue

        for file_id in os.listdir(idx_path):
            folder = os.path.join(idx_path, file_id)
            if os.path.isfile(folder):
                continue

            wimp_trans_files = [
                os.path.join(folder, meta_filename % (file_id, 'A')),
                os.path.join(folder, meta_filename % (file_id, 'B'))
            ]

            swd_trans_files = [
                os.path.join(config.swd_transcripts, idx, file_id,
                             meta_filename % (file_id, 'A')),
                os.path.join(config.swd_transcripts, idx, file_id,
                             meta_filename % (file_id, 'B'))
            ]

            for i, wimp_trans_file in enumerate(wimp_trans_files):
                swd_trans_file = swd_trans_files[i]
                file_id, speaker = swd_trans_file.split("/")[-2:]
                speaker = speaker[6]
                with open(wimp_trans_file) as w_file_obj, open(
                        swd_trans_file) as s_file_obj:
                    for line_num, (anns_, wrds_) in enumerate(
                            zip(w_file_obj, s_file_obj)):
                        sentence = []
                        anns = anns_.strip().split(' ')[3:]
                        wrds = wrds_.strip().split(' ')[3:]
                        assert len(anns) == len(wrds), \
                            "file mismatch, line %d : %s and %s" % (
                                line_num, swd_trans_file, wimp_trans_file)

                        for id_, wrd in enumerate(wrds):
                            wrd = clean_word(wrd)
                            if wrd != '':
                                sentence.append([(file_id, line_num, speaker),
                                                 wrd,
                                                 float(anns[id_])])

                        if len(sentence) != 0:
                            annotations.append(sentence)

    random.shuffle(annotations)

    # 80% for training, 10% dev, 10% test
    cut_80 = int(0.8 * len(annotations))
    cut_90 = int(0.9 * len(annotations))
    d_train = annotations[:cut_80]
    d_test = annotations[cut_80:cut_90]
    d_dev = annotations[cut_90:]

    def prep_text_data(D, outfile):
        with open(outfile, 'w') as f:
            for sent in D:
                for _, word, label in sent:
                    f.write("%s %f\n" % (word, label))
                f.write("\n")

    prep_text_data(d_train, config.train_filename)
    prep_text_data(d_test, config.test_filename)
    prep_text_data(d_dev, config.dev_filename)

    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev = AnnotationDataset(config.dev_filename, processing_word)
    test = AnnotationDataset(config.test_filename, processing_word)
    train = AnnotationDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    # Vocabulary is built using training data
    vocab_words, vocab_tags = get_vocabs([train])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    # Build and save char vocab
    train = AnnotationDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
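These build_data functions are meant to be run once, before training, with a config object holding the filenames and dimensions they reference. A minimal (assumed) entry point, using a Config class like the ones in Examples #8 and #11:

if __name__ == "__main__":
    config = Config()  # hypothetical: holds the *_filename and dim attributes used above
    build_data(config)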
Example #10
def build_data(config):
    """
    Procedure to build data

    Args:
        config: defines attributes needed in the function
    Returns:
        creates vocab files from the datasets
        creates an npz embedding file from trimmed GloVe vectors
    """
    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev = CoNLLDataset(config.dev_filename, processing_word)
    test = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags, vocab_pos = get_vocabs([train, dev,
                                                     test])  #pos adding-----
    vocab_glove = get_glove_vocab(config.glove_filename)
    vocab_dic = get_dic_vocab(config.dic_filename, 1)  #add dic vector get
    vocab_syl = get_dic_vocab(config.syl_filename, 1)  #add syl vector
    vocab_morph = get_morph_vocab(config.morph_vec_filename)  #morph vector get

    vocab = vocab_words & vocab_glove
    vocab.add(UNK.decode('utf-8'))
    vocab.add(NUM.decode('utf-8'))

    word_dic = vocab_dic  #add dic
    word_dic.add(UNK.decode('utf-8'))
    word_dic.add(NUM.decode('utf-8'))

    word_syl = vocab_syl  #add syl
    word_syl.add(UNK.decode('utf-8'))
    word_syl.add(NUM.decode('utf-8'))

    word_morph = vocab_morph  # add morph
    word_morph.add(UNK.decode('utf-8'))
    word_morph.add(NUM.decode('utf-8'))

    vocab_pos.add(UNK.decode('utf-8'))

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)
    write_vocab(word_dic, config.word_dic_filename)  #add dic
    write_vocab(word_syl, config.word_syl_filename)  #add syl
    write_vocab(word_morph, config.morphs_filename)  #add morph
    write_vocab(vocab_pos, config.posTag_filename)  #add pos

    # Trim GloVe Vectors(pretrain vector)
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)
    word_dic = load_vocab(config.word_dic_filename)  #dic add
    export_dic_vectors(word_dic, config.dic_filename, config.exported_filename,
                       config.dic_dim)
    word_syl = load_vocab(config.word_syl_filename)  #syl add
    export_syl_vectors(word_syl, config.syl_filename,
                       config.exported_sfilename, config.syl_dim)
    word_morph = load_vocab(config.morphs_filename)  #morph add
    export_morph_vectors(word_morph, config.morph_vec_filename,
                         config.exported_mfilename, config.dim_morph)
    vocab_pos = load_vocab(config.posTag_filename)  #pos add
    export_pos_vectors(vocab_pos, config.pos_vec_filename,
                       config.exported_pfilename, config.dim_pos)

    # Build and save char vocab, morph vocab
    train = CoNLLDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
Example #11
    get_glove_vocab, write_vocab, load_vocab, get_char_vocab, \
    export_trimmed_glove_vectors, get_processing_word

# Create instance of config
config = Config()

processing_word = get_processing_word(lowercase=True)

# Generators
dev   = CoNLLDataset(config.filename_dev, processing_word)
test  = CoNLLDataset(config.filename_test, processing_word)
train = CoNLLDataset(config.filename_train, processing_word)

# Build Word and Tag vocab
vocab_words, vocab_tags = get_vocabs([train, dev, test])
vocab_glove = get_glove_vocab(config.filename_glove)
vocab = vocab_words & vocab_glove
vocab.add(config.UNK)
vocab.add(config.NUM)

# Save vocab
write_vocab(vocab, config.filename_words)
write_vocab(vocab_tags, config.filename_tags)

# Trim GloVe Vectors
vocab = load_vocab(config.filename_words)
export_trimmed_glove_vectors(vocab, config.filename_glove,
                            config.filename_trimmed, config.dim_word)

# Build and save char vocab
train = CoNLLDataset(config.filename_train)
vocab_chars = get_char_vocab(train)
write_vocab(vocab_chars, config.filename_chars)
Example #12
vocab_words = set()
vocab_tags = set()
vocab_chars = set()
file = open('data/all.txt')
for line in file:
	line = line.strip()
	if len(line) == 0:
		continue
	token, tag = line.split(' ')
	print(token, tag)
	for c in token:
		vocab_chars.add(c)
	vocab_words.add(token)
	vocab_tags.add(tag)

# Build Word and Tag vocab
vocab_glove = get_glove_vocab(config.glove_filename)

vocab = vocab_words & vocab_glove
vocab.add(UNK)
vocab.add(NUM)

# Save vocabs
write_vocab(vocab, config.words_filename)
write_vocab(vocab_tags, config.tags_filename)
write_vocab(vocab_chars, config.chars_filename)

# Trim GloVe Vectors
vocab = load_vocab(config.words_filename)
export_trimmed_glove_vectors(vocab, config.glove_filename, config.trimmed_filename, config.dim)
Example #13
glove_filename = "{}/glove.6B.{}d.txt".format(glove_dir, dim_word)
# trimmed embeddings (created from glove_filename with build_data.py)
filename_trimmed = "{}/glove.6B.{}d.trimmed.npz".format(output_dir, dim_word)

words_filename = "{}/words.txt".format(output_dir)
tags_filename = "{}/tags.txt".format(output_dir)
chars_filename = "{}/chars.txt".format(output_dir)

processing_word = get_processing_word(lowercase=True)

train = CoNLLDataset(train_filename, processing_word)
valid = CoNLLDataset(valid_filename, processing_word)

# Build word and tag vocabs
vocab_words, vocab_tags = get_vocabs([train, valid])
vocab_glove = get_glove_vocab(glove_filename)

vocab = vocab_words & vocab_glove
vocab.add(UNK)
vocab.add(NUM)

# Save vocab
write_vocab(vocab, words_filename)
write_vocab(vocab_tags, tags_filename)

# Trim GloVe Vectors
vocab = load_vocab(words_filename)
export_trimmed_glove_vectors(vocab, glove_filename, filename_trimmed, dim_word)

# Build and save char vocab
train = CoNLLDataset(train_filename)
vocab_chars = get_char_vocab(train)
write_vocab(vocab_chars, chars_filename)