def build_data(config): """ Procedure to build data Args: config: defines attributes needed in the function Returns: creates vocab files from the datasets creates a npz embedding file from trimmed glove vectors """ processing_word = get_processing_word(lowercase=True) # Generators dev = CoNLLDataset(config.dev_filename, processing_word) #test = CoNLLDataset(config.test_filename, processing_word) train = CoNLLDataset(config.train_filename, processing_word) # Build Word and Tag vocab vocab_words, vocab_tags, vocab_pos = get_vocabs([train, dev]) vocab_glove = get_glove_vocab(config.glove_filename) vocab_glove_uni = get_glove_vocab(config.glove_uni_filename) vocab_feature = get_pos_glove_vocab(config.glove_filename) # vocab = vocab_words & vocab_glove vocab = vocab_glove | vocab_words vocab.add(UNK) vocab.add(NUM) vocab_pos = vocab_feature vocab_pos.add(UNK) vocab_pos.add(NUM) # Save vocab write_vocab(vocab, config.words_filename) write_vocab(vocab_glove_uni, config.uni_words_filename) write_vocab(vocab_tags, config.tags_filename) write_vocab(vocab_pos, config.pos_filename) # Trim GloVe Vectors vocab = load_vocab(config.words_filename) export_trimmed_glove_vectors(vocab, config.glove_filename, config.trimmed_filename, config.t_dim) vocab = load_vocab(config.uni_words_filename) export_trimmed_uni_vectors(vocab, config.NEdic_filename, config.trimmed_dic, config.dic_dim) export_trimmed_uni_vectors(vocab, config.glove_uni_filename, config.uni_trimmed_filename, config.dim) vocab_feature = load_vocab(config.pos_filename) export_trimmed_pos_vectors(vocab_feature, config.glove_feature, config.feature_trimmed_filename, config.pos_dim) # Build and save char vocab train = CoNLLDataset(config.train_filename) vocab_chars = get_char_vocab(train) write_vocab(vocab_chars, config.chars_filename)
def build_joint_vocab(config):
    # Common options for all datasets
    processing_word = get_processing_word(lowercase=True)
    vocab_glove = get_glove_vocab(config.filename_glove)

    # Compute and save individual vocab
    v1_words, v1_chars = get_conll2005_vocab(config.conll2005, processing_word, vocab_glove)
    v2_words, v2_chars = get_conll2003_vocab(config.conll2003, processing_word, vocab_glove)
    v3_words, v3_chars = get_semcor_vocab(config.semcor, processing_word, vocab_glove)

    print(" *** Joint vocabulary ***")
    vocab_words = v1_words.union(v2_words, v3_words)
    vocab_chars = v1_chars.union(v2_chars, v3_chars)

    # Save combined vocab
    write_vocab(vocab_words, config.filename_words)
    write_vocab(vocab_chars, config.filename_chars)

    # Trim GloVe Vectors
    vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)

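# Sketch (assumption): the write_vocab / load_vocab helpers used by most of the
# variants in this file are assumed to round-trip a vocabulary as one token per
# line, with the line number serving as the token id. The minimal stand-ins
# below only illustrate that contract; they are not the project's actual
# implementations, and one snippet later in this file reverses the argument order.
def write_vocab_sketch(vocab, filename):
    # One token per line; iteration order fixes the ids.
    with open(filename, "w") as f:
        f.write("\n".join(vocab))

def load_vocab_sketch(filename):
    # Map each token back to its line index.
    with open(filename) as f:
        return {line.strip(): idx for idx, line in enumerate(f)}
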
def build_data(config):
    processing_word = get_processing_word()

    dev = CoNLLDataset(config.dev_filename, processing_word)
    test = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    vocab_words, vocab_tags, vocab_poss = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)
    write_vocab(vocab_poss, config.poss_filename)

    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    train = CoNLLDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)

def build_data(config): """ Procedure to build data Args: config: defines attributes needed in the function Returns: creates vocab files from the datasets creates a npz embedding file from trimmed glove vectors """ processing_word = get_processing_word(lowercase=config.lowercase) # Generators dev = CoNLLDataset(config.dev_filename, processing_word) test = CoNLLDataset(config.test_filename, processing_word) train = CoNLLDataset(config.train_filename, processing_word) # Build Word and Tag vocab vocab_words, vocab_tags = get_vocabs([train, dev, test]) vocab_glove = get_glove_vocab(config.glove_filename) vocab = vocab_words & vocab_glove vocab.add(UNK) vocab.add(NUM) # Save vocab write_vocab(vocab, config.words_filename) write_vocab(vocab_tags, config.tags_filename) # Trim GloVe Vectors vocab = load_vocab(config.words_filename) export_trimmed_glove_vectors(vocab, config.glove_filename, config.trimmed_filename, config.dim)
def build_data(config): """ Procedure to build data Args: config: defines attributes needed in the function Returns: creates vocab files from the datasets creates a npz embedding file from trimmed glove vectors """ processing_word = get_processing_word(lowercase=True) processing_word = get_processing_word(lowercase=True) # clean data train_filepath, dev_filepath_a = write_clear_data( config.train_filename, build_dev=config.build_dev_from_trainset, dev_ratio=config.dev_ratio) test_filepath, dev_filepath_b = write_clear_data( config.test_filename, build_dev=config.build_dev_from_testset, dev_ratio=config.dev_ratio) dev_filepath = dev_filepath_a or dev_filepath_b # Generators dev = Dataset(dev_filepath, processing_word) test = Dataset(test_filepath, processing_word) train = Dataset(train_filepath, processing_word) # Build Word and Tag vocab vocab_words, vocab_tags = get_vocabs([train, dev, test]) vocab_glove = get_glove_vocab(config.glove_filename) vocab = vocab_words & vocab_glove vocab.add(UNK) vocab.add(NUM) # Save vocab write_vocab(vocab, config.words_filename) write_vocab(vocab_tags, config.tags_filename) # Trim GloVe Vectors vocab = load_vocab(config.words_filename) export_trimmed_glove_vectors(vocab, config.glove_filename, config.trimmed_filename, config.dim) # Build and save char vocab train = Dataset(train_filepath) vocab_chars = get_char_vocab(train) write_vocab(vocab_chars, config.chars_filename)
def build_data(config): """ Procedure to build data Args: config: defines attributes needed in the function Returns: creates vocab files from the datasets creates a npz embedding file from trimmed glove vectors """ processing_word = get_processing_word(lowercase=True) # Generators dev = CoNLLDataset(config.dev_filename, processing_word) test = CoNLLDataset(config.test_filename, processing_word) train = CoNLLDataset(config.train_filename, processing_word) # Build Word and Tag vocab vocab_words, vocab_tags = get_vocabs([train, dev, test]) vocab_glove = get_glove_vocab(config.glove_filename) vocab = vocab_words & vocab_glove vocab.add(UNK) vocab.add(NUM) vocab.add(PAD) # Save vocab write_vocab(vocab, config.words_filename) write_vocab(vocab_tags, config.tags_filename) # Trim GloVe Vectors vocab = load_vocab(config.words_filename) export_trimmed_glove_vectors(vocab, config.glove_filename, config.trimmed_filename, config.dim) # Build and save char vocab train = CoNLLDataset(config.train_filename, processing_word) vocab_chars = get_char_vocab(train) write_vocab(vocab_chars, config.chars_filename) # Build and save type vocab vocab_types = set() print len(vocab_tags) for tag in vocab_tags: if tag != 'O': vocab_types.add(tag[2:]) write_vocab(vocab_types, config.types_filename)
def build_data(config): """ Procedure to build data Args: config: defines attributes needed in the function Returns: creates vocab files from the datasets creates a npz embedding file from trimmed glove vectors """ processing_word = get_processing_word(lowercase=config.lowercase) # Generators dev = CoNLLDataset(config.dev_filename, processing_word) test = CoNLLDataset(config.test_filename, processing_word) train = CoNLLDataset(config.train_filename, processing_word) # Build Word and Tag vocab vocab_words, vocab_tags = get_vocabs([train, dev, test]) vocab_glove = get_glove_vocab(config.glove_filename) vocab = vocab_words & vocab_glove vocab.add(UNK) vocab.add(NUM) # Save vocab write_vocab(vocab, config.words_filename) write_vocab(vocab_tags, config.tags_filename) # Trim GloVe Vectors vocab = load_vocab(config.words_filename) export_trimmed_glove_vectors(vocab, config.glove_filename, config.trimmed_filename, config.dim) # Build and save char vocab train = CoNLLDataset(config.train_filename) vocab_chars = get_char_vocab(train) write_vocab(vocab_chars, config.chars_filename)
def main():
    # get config
    config = Config(load=False)

    # Generators
    train = get_datasets(config.filename_train)
    valid = get_datasets(config.filename_valid)
    test = get_datasets(config.filename_test)

    # add <start> to glove
    # add_glove(config.filename_glove, config.dim_word)

    # Build word vocab
    train_words = get_train_vocab(train)
    glove_vocab = get_glove_vocab(config.filename_glove)

    # train & glove (word to index)
    vocab = word2index(train_words, glove_vocab)

    # save vocab
    write_vocab(config.filename_words, vocab)

    # index to word
    index = index2word(vocab)
    write_vocab(config.filename_index, index)

    # embedding
    glove_embedding(config.filename_glove, config.filename_trimmed_glove,
                    config.dim_word, vocab, config.start, config.pad)

    # trim datasets
    get_trimmed_datasets(config.filename_trimmed_train, train, vocab, config.max_length)
    get_trimmed_datasets(config.filename_trimmed_valid, valid, vocab, config.max_length)
    get_trimmed_datasets(config.filename_trimmed_test, test, vocab, config.max_length)

def build_data(config):
    annotations = []
    meta_filename = 'sw%s%s-ms98-a-trans.text'  # % (file_id, speaker_id)
    for idx in os.listdir(config.wimp_corpus):
        idx_path = os.path.join(config.wimp_corpus, idx)
        if os.path.isfile(idx_path):
            continue
        for file_id in os.listdir(idx_path):
            folder = os.path.join(idx_path, file_id)
            if os.path.isfile(folder):
                continue
            wimp_trans_files = [
                os.path.join(folder, meta_filename % (file_id, 'A')),
                os.path.join(folder, meta_filename % (file_id, 'B'))
            ]
            swd_trans_files = [
                os.path.join(config.swd_transcripts, idx, file_id,
                             meta_filename % (file_id, 'A')),
                os.path.join(config.swd_transcripts, idx, file_id,
                             meta_filename % (file_id, 'B'))
            ]
            for i, wimp_trans_file in enumerate(wimp_trans_files):
                swd_trans_file = swd_trans_files[i]
                file_id, speaker = swd_trans_file.split("/")[-2:]
                speaker = speaker[6]
                with open(wimp_trans_file) as w_file_obj, open(
                        swd_trans_file) as s_file_obj:
                    for line_num, (anns_, wrds_) in enumerate(
                            zip(w_file_obj, s_file_obj)):
                        sentence = []
                        anns = anns_.strip().split(' ')[3:]
                        wrds = wrds_.strip().split(' ')[3:]
                        assert(len(anns) == len(wrds)), \
                            "file mismatch, line %d : %s and %s" % (
                                line_num, swd_trans_file, wimp_trans_file)
                        for id_, wrd in enumerate(wrds):
                            wrd = clean_word(wrd)
                            if wrd != '':
                                sentence.append([(file_id, line_num, speaker),
                                                 wrd, float(anns[id_])])
                        if len(sentence) != 0:
                            annotations.append(sentence)

    random.shuffle(annotations)
    # 80% for training, 10% dev, 10% test (slice indices must be integers)
    n_total = len(annotations)
    n_train = int(0.8 * n_total)
    n_dev_start = int(0.9 * n_total)
    d_train = annotations[:n_train]
    d_test = annotations[n_train:n_dev_start]
    d_dev = annotations[n_dev_start:]

    def prep_text_data(D, outfile):
        with open(outfile, 'w') as f:
            for sent in D:
                for _, word, label in sent:
                    f.write("%s %f\n" % (word, label))
                f.write("\n")

    prep_text_data(d_train, config.train_filename)
    prep_text_data(d_test, config.test_filename)
    prep_text_data(d_dev, config.dev_filename)

    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev = AnnotationDataset(config.dev_filename, processing_word)
    test = AnnotationDataset(config.test_filename, processing_word)
    train = AnnotationDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    # Vocabulary is built using training data
    vocab_words, vocab_tags = get_vocabs([train])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    # Build and save char vocab
    train = AnnotationDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)

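# Sketch (assumption): prep_text_data above serializes each annotated sentence
# as one "word score" pair per line followed by a blank line, which is the
# format the AnnotationDataset generators are then assumed to read back.
# The word and score values below are illustrative only.
assert "%s %f\n" % ("okay", 0.3) == "okay 0.300000\n"
expected_sentence_block = "okay 0.300000\nso 0.100000\n\n"
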
def build_data(config): """ Procedure to build data Args: config: defines attributes needed in the function Returns: creates vocab files from the datasets creates a npz embedding file from trimmed glove vectors """ processing_word = get_processing_word(lowercase=True) # Generators dev = CoNLLDataset(config.dev_filename, processing_word) test = CoNLLDataset(config.test_filename, processing_word) train = CoNLLDataset(config.train_filename, processing_word) # Build Word and Tag vocab vocab_words, vocab_tags, vocab_pos = get_vocabs([train, dev, test]) #pos adding----- vocab_glove = get_glove_vocab(config.glove_filename) vocab_dic = get_dic_vocab(config.dic_filename, 1) #add dic vector get vocab_syl = get_dic_vocab(config.syl_filename, 1) #add syl vector vocab_morph = get_morph_vocab(config.morph_vec_filename) #morph vector get vocab = vocab_words & vocab_glove vocab.add(UNK.decode('utf-8')) vocab.add(NUM.decode('utf-8')) word_dic = vocab_dic #add dic word_dic.add(UNK.decode('utf-8')) word_dic.add(NUM.decode('utf-8')) word_syl = vocab_syl #add syl word_syl.add(UNK.decode('utf-8')) word_syl.add(NUM.decode('utf-8')) word_morph = vocab_morph # add morph word_morph.add(UNK.decode('utf-8')) word_morph.add(NUM.decode('utf-8')) vocab_pos.add(UNK.decode('utf-8')) # Save vocab write_vocab(vocab, config.words_filename) write_vocab(vocab_tags, config.tags_filename) write_vocab(word_dic, config.word_dic_filename) #add dic write_vocab(word_syl, config.word_syl_filename) #add syl write_vocab(word_morph, config.morphs_filename) #add morph write_vocab(vocab_pos, config.posTag_filename) #add pos # Trim GloVe Vectors(pretrain vector) vocab = load_vocab(config.words_filename) export_trimmed_glove_vectors(vocab, config.glove_filename, config.trimmed_filename, config.dim) word_dic = load_vocab(config.word_dic_filename) #dic add export_dic_vectors(word_dic, config.dic_filename, config.exported_filename, config.dic_dim) word_syl = load_vocab(config.word_syl_filename) #syl add export_syl_vectors(word_syl, config.syl_filename, config.exported_sfilename, config.syl_dim) word_morph = load_vocab(config.morphs_filename) #morph add export_morph_vectors(word_morph, config.morph_vec_filename, config.exported_mfilename, config.dim_morph) vocab_pos = load_vocab(config.posTag_filename) #pos add export_pos_vectors(vocab_pos, config.pos_vec_filename, config.exported_pfilename, config.dim_pos) # Build and save char vocab, morph vocab train = CoNLLDataset(config.train_filename) vocab_chars = get_char_vocab(train) write_vocab(vocab_chars, config.chars_filename)
    get_glove_vocab, write_vocab, load_vocab, get_char_vocab, \
    export_trimmed_glove_vectors, get_processing_word

# Create instance of config
config = Config()

processing_word = get_processing_word(lowercase=True)

# Generators
dev = CoNLLDataset(config.filename_dev, processing_word)
test = CoNLLDataset(config.filename_test, processing_word)
train = CoNLLDataset(config.filename_train, processing_word)

# Build Word and Tag vocab
vocab_words, vocab_tags = get_vocabs([train, dev, test])
vocab_glove = get_glove_vocab(config.filename_glove)

vocab = vocab_words & vocab_glove
vocab.add(config.UNK)
vocab.add(config.NUM)

# Save vocab
write_vocab(vocab, config.filename_words)
write_vocab(vocab_tags, config.filename_tags)

# Trim GloVe Vectors
vocab = load_vocab(config.filename_words)
export_trimmed_glove_vectors(vocab, config.filename_glove,
                             config.filename_trimmed, config.dim_word)

# Build and save char vocab
train = CoNLLDataset(config.filename_train)

vocab_words = set()
vocab_tags = set()
vocab_chars = set()

file = open('data/all.txt')
for line in file:
    line = line.strip()
    if len(line) == 0:
        continue
    token, tag = line.split(' ')
    print(token, tag)
    for c in token:
        vocab_chars.add(c)
    vocab_words.add(token)
    vocab_tags.add(tag)

# Build Word and Tag vocab
vocab_glove = get_glove_vocab(config.glove_filename)
vocab = vocab_words & vocab_glove
vocab.add(UNK)
vocab.add(NUM)

# Save vocabs
write_vocab(vocab, config.words_filename)
write_vocab(vocab_tags, config.tags_filename)
write_vocab(vocab_chars, config.chars_filename)

# Trim GloVe Vectors
vocab = load_vocab(config.words_filename)
export_trimmed_glove_vectors(vocab, config.glove_filename,
                             config.trimmed_filename, config.dim)

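# Sketch (assumption): the loop above expects data/all.txt to hold one
# "token tag" pair per space-separated line, with blank lines between sentences
# (CoNLL-style two-column format). The tokens and tags below are illustrative
# only; they parse with the same split logic used above.
sample_lines = ["EU B-ORG", "rejects O", "German B-MISC", "",
                "Peter B-PER", "Blackburn I-PER"]
for sample_line in sample_lines:
    if len(sample_line.strip()) == 0:
        continue                                   # blank line: sentence boundary
    sample_token, sample_tag = sample_line.split(' ')
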
glove_filename = "{}/glove.6B.{}d.txt".format(glove_dir, dim_word) # trimmed embeddings (created from glove_filename with build_data.py) filename_trimmed = "{}/glove.6B.{}d.trimmed.npz".format(output_dir, dim_word) words_filename = "{}/words.txt".format(output_dir) tags_filename = "{}/tags.txt".format(output_dir) chars_filename = "{}/chars.txt".format(output_dir) processing_word = get_processing_word(lowercase=True) train = CoNLLDataset(train_filename, processing_word) valid = CoNLLDataset(valid_filename, processing_word) # Build word and tag vocabs vocab_words, vocab_tags = get_vocabs([train, valid]) vocab_glove = get_glove_vocab(glove_filename) vocab = vocab_words & vocab_glove vocab.add(UNK) vocab.add(NUM) # Save vocab write_vocab(vocab, words_filename) write_vocab(vocab_tags, tags_filename) # Trim GloVe Vectors vocab = load_vocab(words_filename) export_trimmed_glove_vectors(vocab, glove_filename, filename_trimmed, dim_word) # Build and save char vocab train = CoNLLDataset(train_filename)