def load(self):
    """Populate vocabularies, str -> id processing functions and embeddings.

    Expects the files produced by build_data.py (the vocabulary files and
    the trimmed GloVe vectors) to exist already.
    """
    # 1. vocabularies, one file per symbol kind
    self.vocab_words = load_vocab(self.filename_words)
    self.vocab_tags = load_vocab(self.filename_tags)
    self.vocab_chars = load_vocab(self.filename_chars)

    # sizes of the three vocabularies
    self.nwords = len(self.vocab_words)
    self.nchars = len(self.vocab_chars)
    self.ntags = len(self.vocab_tags)

    # 2. processing functions that map str -> id
    self.processing_word = get_processing_word(
        self.vocab_words,
        self.vocab_chars,
        lowercase=True,
        chars=self.use_chars,
    )
    self.processing_tag = get_processing_word(
        self.vocab_tags,
        lowercase=False,
        allow_unk=False,
    )

    # 3. pre-trained embeddings are only read when the config asks for them
    if self.use_pretrained:
        self.embeddings = get_trimmed_glove_vectors(self.filename_trimmed)
    else:
        self.embeddings = None
def main():
    """Build word, tag, POS, chunk and char vocabularies plus trimmed vectors.

    Reads the train/dev/test files named in the config, writes one
    vocabulary file per symbol kind (each with a trailing "$pad$" entry),
    then exports trimmed GloVe vectors for the words and trimmed one-hot
    vectors for the POS and chunk vocabularies.
    """
    config = Config(load=False)

    # One processing function per input column; only words are lowercased.
    word_fn = get_processing_word(lowercase=True)
    pos_fn = get_processing_word()
    chunk_fn = get_processing_word()

    # Datasets (created in the order dev, test, train)
    dev = CoNLLDataset(config.filename_dev, word_fn, pos_fn, chunk_fn)
    test = CoNLLDataset(config.filename_test, word_fn, pos_fn, chunk_fn)
    train = CoNLLDataset(config.filename_train, word_fn, pos_fn, chunk_fn)

    # Vocabularies found in the data, and the words known to GloVe
    splits = [train, dev, test]
    vocab_words, vocab_tags, vocab_poses, vocab_chunks = get_vocabs(splits)
    vocab_glove = get_glove_vocab(config.filename_glove)

    # Keep only words that have a GloVe vector, then add the special tokens.
    vocab = [word for word in vocab_words if word in vocab_glove]
    vocab.extend([UNK, NUM, "$pad$"])
    vocab_poses.append("$pad$")
    vocab_chunks.append("$pad$")
    vocab_tags.append("$pad$")

    # Save the vocabularies
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)
    write_vocab(vocab_poses, config.filename_poses)
    write_vocab(vocab_chunks, config.filename_chunks)

    # Trim GloVe vectors against the vocabulary as re-read from disk
    word_vocab = load_vocab(config.filename_words)
    print(len(word_vocab))
    export_trimmed_glove_vectors(
        word_vocab,
        config.filename_glove,
        config.filename_trimmed,
        config.dim_word,
    )

    # One-hot vectors for POS tags and chunks
    pos_vocab = load_vocab(config.filename_poses)
    export_trimed_ont_hot_vectors(pos_vocab, config.filename_pos_trimmed)
    chunk_vocab = load_vocab(config.filename_chunks)
    export_trimed_ont_hot_vectors(chunk_vocab, config.filename_chunk_trimmed)

    # Char vocabulary comes from the training file without any processing
    # function applied.
    raw_train = CoNLLDataset(config.filename_train)
    vocab_chars = get_char_vocab(raw_train)
    vocab_chars.append("$pad$")
    write_vocab(vocab_chars, config.filename_chars)
def main():
    """Extend an existing word vocabulary file with words from the test file.

    Collects the words of ``config.filename_test`` that also appear in the
    GloVe vocabulary, passes them through ``entity2vocab``, and appends to
    ``config.filename_words`` every resulting entry that the file does not
    already contain. The trimmed GloVe matrix is then re-exported for the
    vocabulary as re-read from that file.
    """
    config = Config(load=False)
    # Original author's note: lowercases every character and replaces
    # numbers with NUM.
    processing_word = get_processing_word(lowercase=True)

    # Dataset whose words are candidates for addition (original author's
    # note: yields a sentence (words) together with its tags).
    to_be_add = CoNLLDataset1(config.filename_test, processing_word)

    # Words of that dataset that have a GloVe vector
    vocab_words, _ = get_vocabs([to_be_add])
    vocab_glove = get_glove_vocab(config.filename_glove)  # GloVe vocabulary
    words_have_vec = vocab_words & vocab_glove

    vocab_words_and_entity = entity2vocab(datasets=[to_be_add],
                                          vocab=words_have_vec)

    # Only entries missing from the vocabulary file are appended.
    vocab_in_file = set(load_vocab(config.filename_words))
    vocab_words_to_be_add = vocab_words_and_entity - vocab_in_file
    if vocab_words_to_be_add:
        with open(config.filename_words, 'a') as f:
            # Each entry is written with a leading newline, so the file
            # keeps having no trailing newline.
            f.writelines('\n{}'.format(vocab_word)
                         for vocab_word in vocab_words_to_be_add)

    # Trim GloVe vectors (original author's note: load_vocab gives a dict
    # {word: index}; the export writes a numpy embedding matrix for it).
    vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)
def main():
    """Build the word, tag and char vocabularies and the trimmed GloVe file.

    Words come from train, dev and test, restricted to those present in the
    GloVe vocabulary, plus the UNK and NUM tokens.
    """
    config = Config(load=False)
    lowercase_word = get_processing_word(lowercase=True)

    # Datasets (dev, test, train)
    dev = CoNLLDataset(config.filename_dev, lowercase_word)
    test = CoNLLDataset(config.filename_test, lowercase_word)
    train = CoNLLDataset(config.filename_train, lowercase_word)

    # Word and tag vocabularies
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.filename_glove)

    # Intersect with GloVe, then add the special tokens.
    vocab = vocab_words & vocab_glove
    for special_token in (UNK, NUM):
        vocab.add(special_token)

    # Save the vocabularies
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Trim GloVe vectors against the vocabulary as re-read from disk
    saved_vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(
        saved_vocab,
        config.filename_glove,
        config.filename_trimmed,
        config.dim_word,
    )

    # Char vocabulary from the training file, no processing function
    raw_train = CoNLLDataset(config.filename_train)
    write_vocab(get_char_vocab(raw_train), config.filename_chars)
def main():
    """Build the vocabulary files and the trimmed GloVe matrix.

    Run this before training. It walks train, dev and test, collects the
    words and tags, keeps the words that have a GloVe vector (plus UNK and
    NUM), writes the vocabularies to disk and exports the GloVe vectors for
    the saved word vocabulary. The char vocabulary is built from the
    training file alone.
    """
    config = Config(load=False)
    # Original author's note: lowercases every character and replaces
    # numbers with NUM.
    processing_word = get_processing_word(lowercase=True)

    # Datasets (original author's note: each is a generator-like object
    # yielding (words, tags) tuples, one sentence at a time).
    dev = CoNLLDataset(config.filename_dev, processing_word)
    test = CoNLLDataset(config.filename_test, processing_word)
    train = CoNLLDataset(config.filename_train, processing_word)

    # Word and tag vocabularies
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    print(len(vocab_words))
    vocab_glove = get_glove_vocab(config.filename_glove)  # GloVe vocabulary

    # Set intersection, then the special tokens are added by hand.
    vocab = vocab_words & vocab_glove
    for special_token in (UNK, NUM):
        vocab.add(special_token)
    print("len of vocab without entity: ", len(vocab))
    # NOTE: augmenting the vocabulary through entity2vocab is disabled here.

    # Save the vocabularies
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Trim GloVe vectors (original author's note: load_vocab gives a dict
    # {word: index}; the export writes a numpy embedding matrix for it).
    saved_vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(
        saved_vocab,
        config.filename_glove,
        config.filename_trimmed,
        config.dim_word,
    )

    # Char vocabulary: built from the raw training file, so no lowercasing
    # is involved here.
    raw_train = CoNLLDataset(config.filename_train)
    vocab_chars = get_char_vocab(raw_train)
    write_vocab(vocab_chars, config.filename_chars)
def main():
    """Build the vocabulary files and trimmed GloVe / word2vec matrices.

    Run this before training. Words and tags are collected from train, dev
    and test (each opened with ``task=config.task``); the word vocabulary is
    the intersection with the GloVe vocabulary plus UNK and NUM. The
    word2vec binary is first dumped to a text file, after which trimmed
    word2vec and GloVe vectors are exported for the saved vocabulary.
    """
    config = Config(load=False)
    processing_word = get_processing_word(lowercase=True)

    # Datasets (dev, test, train), all for the configured task
    task = config.task
    dev = CoNLLDataset(config.filename_dev, processing_word, task=task)
    test = CoNLLDataset(config.filename_test, processing_word, task=task)
    train = CoNLLDataset(config.filename_train, processing_word, task=task)

    # Word and tag vocabularies
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.filename_glove)
    # TODO get word2vec vocab too

    vocab = vocab_words & vocab_glove
    for special_token in (UNK, NUM):
        vocab.add(special_token)

    # Save word and tag vocab
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Write and trim word2vec and GloVe vectors
    saved_vocab = load_vocab(config.filename_words)
    write_word2vec_to_txtfile(
        config.path_to_word2vec_bin_file,
        config.filename_word2vec,
    )
    export_trimmed_word2vec_vectors(
        saved_vocab,
        config.filename_word2vec,
        config.trimmed_word2vec_filename,
        config.dim_word,
    )
    export_trimmed_glove_vectors(
        saved_vocab,
        config.filename_glove,
        config.trimmed_glove_filename,
        config.dim_word,
    )

    # Char vocabulary from the training file (no processing function, no
    # task argument)
    raw_train = CoNLLDataset(config.filename_train)
    write_vocab(get_char_vocab(raw_train), config.filename_chars)
def main():
    """Build the word, pronoun and char vocabulary files and trimmed GloVe.

    Run this before training. Words are collected from train, dev and test
    and intersected with the GloVe vocabulary. ``move_pronouns`` and
    ``add_special_tokens`` are then applied to the word list; the result of
    the former is saved as the pronoun vocabulary. The char vocabulary is
    built from the training file and gets PAD_TOKEN as its first entry.
    """
    config = Config(load=False)
    pw_function = get_processing_word(lowercase=True)

    # Datasets (dev, test, train)
    dev = Dataset(config.filename_dev, processing_word=pw_function)
    test = Dataset(config.filename_test, processing_word=pw_function)
    train = Dataset(config.filename_train, processing_word=pw_function)

    # Words present both in the data and in GloVe, as a list
    vocab_words = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.filename_glove)
    vocab = list(vocab_words & vocab_glove)

    pronouns_in_vocab = move_pronouns(vocab)
    write_vocab(pronouns_in_vocab, config.filename_pronouns)

    # Original author's note: adds START, STOP, PAD, UNK and NUM tokens to
    # the list. The asserts check what the rest of the pipeline relies on.
    add_special_tokens(vocab)
    assert PAD_TOKEN == vocab[0]
    assert UNKNOWN_TOKEN in vocab

    # Save vocab
    write_vocab(vocab, config.filename_words)

    # Trim GloVe vectors; load_vocab returns a pair here and only its first
    # element is used.
    saved_vocab, _ = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(
        saved_vocab,
        config.filename_glove,
        config.filename_trimmed,
        config.dim_word,
    )

    # Char vocabulary from the raw training file, PAD_TOKEN first
    raw_train = Dataset(config.filename_train)
    vocab_chars = [PAD_TOKEN, *get_char_vocab(raw_train)]
    write_vocab(vocab_chars, config.filename_chars)
def main():
    """Build the vocabulary files and the trimmed GloVe matrix.

    Run this before training. When ``config.task`` is ``'pos'`` the
    train/dev/test paths are redirected to the ``data/*.pos`` files;
    otherwise the configured paths are used unchanged. Words are restricted
    to those present in the GloVe vocabulary, plus UNK and NUM.
    """
    config = Config(load=False)

    if config.task != 'pos':
        print("USING NER")
    else:
        print("USING POS")
        # POS tagging reads its own data files.
        config.filename_train = "data/train.pos"
        config.filename_dev = "data/dev.pos"
        config.filename_test = "data/test.pos"

    processing_word = get_processing_word(lowercase=True)

    # Datasets (dev, test, train)
    dev = CoNLLDataset(config.filename_dev, processing_word)
    test = CoNLLDataset(config.filename_test, processing_word)
    train = CoNLLDataset(config.filename_train, processing_word)

    # Word and tag vocabularies
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.filename_glove)

    vocab = vocab_words & vocab_glove
    for special_token in (UNK, NUM):
        vocab.add(special_token)

    # Save the vocabularies
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Trim GloVe vectors against the vocabulary as re-read from disk
    saved_vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(
        saved_vocab,
        config.filename_glove,
        config.filename_trimmed,
        config.dim_word,
    )

    # Char vocabulary from the training file, no processing function
    raw_train = CoNLLDataset(config.filename_train)
    write_vocab(get_char_vocab(raw_train), config.filename_chars)
def main():
    """Build the vocabulary files and the trimmed GloVe matrix, with logging.

    Run this before training. Words and tags are collected from train, dev
    and test; the word vocabulary is the intersection with the GloVe
    vocabulary plus UNK and NUM. Each output file name is printed just
    before the file is written.
    """
    config = Config(load=False)
    processing_word = get_processing_word(lowercase=True)

    # Datasets (dev, test, train)
    dev = CoNLLDataset(config.filename_dev, processing_word)
    test = CoNLLDataset(config.filename_test, processing_word)
    train = CoNLLDataset(config.filename_train, processing_word)

    # Word and tag vocabularies
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.filename_glove)

    # Intersect with the GloVe word set so that only words with a vector
    # are kept, then add the special tokens.
    vocab = vocab_words & vocab_glove
    for special_token in (UNK, NUM):
        vocab.add(special_token)

    # Save the vocabularies (vocab is a set at this point)
    print("write vocab set to file: " + config.filename_words)
    write_vocab(vocab, config.filename_words)
    print("write vocab tags set to file: " + config.filename_tags)
    write_vocab(vocab_tags, config.filename_tags)

    # Trim GloVe vectors: only vectors of words in the vocabulary are kept.
    # Original author's note: the loaded vocab is a dict, word -> index.
    vocab_to_index_dict = load_vocab(config.filename_words)
    print("export trimmed vocab embedding to file: " + config.filename_trimmed)
    export_trimmed_glove_vectors(
        vocab_to_index_dict,
        config.filename_glove,
        config.filename_trimmed,
        config.dim_word,
    )

    # Char vocabulary from the training file, no processing function
    raw_train = CoNLLDataset(config.filename_train)
    vocab_chars = get_char_vocab(raw_train)
    print("save char set to file:" + config.filename_chars)
    write_vocab(vocab_chars, config.filename_chars)
def main():
    """Build vocabulary files from the training set and trim GloVe vectors.

    Run this before training. Unlike the usual procedure, words, word
    frequencies and tags are collected from the training set only. The
    saved word vocabulary is UNK followed by the training words that have a
    GloVe vector (plus NUM). Words whose frequency is exactly 1 are written
    to a separate singletons file.
    """
    config = Config(load=False)
    processing_word = get_processing_word(lowercase=True)

    # Datasets; dev and test are created but not fed to get_vocabs below.
    dev = CoNLLDataset(config.filename_dev, processing_word)
    test = CoNLLDataset(config.filename_test, processing_word)
    train = CoNLLDataset(config.filename_train, processing_word)

    # Word and tag vocab (only from train!)
    vocab_words, vocab_freqs, vocab_tags = get_vocabs([train])
    vocab_glove = get_glove_vocab(config.filename_glove)

    # NOTE: the make_unks(vocab, vocab_freqs, config.p_unk) step is
    # disabled; UNK is placed first in the list instead of being added to
    # the set.
    known_words = vocab_words & vocab_glove
    known_words.add(NUM)
    vocab = [UNK, *known_words]

    # Save the vocabularies
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Words seen exactly once
    singletons = [word for word, count in vocab_freqs.items() if count == 1]
    write_vocab(singletons, config.filename_singletons)

    # Trim GloVe vectors against the vocabulary as re-read from disk
    saved_vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(
        saved_vocab,
        config.filename_glove,
        config.filename_trimmed,
        config.dim_word,
    )

    # Char vocabulary from the training file, no processing function
    raw_train = CoNLLDataset(config.filename_train)
    write_vocab(get_char_vocab(raw_train), config.filename_chars)
def main():
    """Build the vocabulary files and trimmed vectors for ritter2011.

    The train/dev/test paths are pointed at ``../datasets/ritter2011`` and
    every "source" in the chars/glove/tags/words file names, the model and
    output directories and the log path is replaced by "target". UNK is
    added to the tag vocabulary as well as to the word vocabulary.
    """
    config = Config(load=False)

    # should be source_x.txt
    # or ontonotes-nw if you like
    config.filename_train = "../datasets/ritter2011/train"
    config.filename_dev = "../datasets/ritter2011/dev"
    config.filename_test = "../datasets/ritter2011/test"

    # Redirect every derived path from the source to the target dataset.
    retargeted_attrs = (
        "filename_chars",
        "filename_glove",
        "filename_tags",
        "filename_words",
        "dir_model",
        "dir_output",
        "path_log",
    )
    for attr in retargeted_attrs:
        setattr(config, attr, getattr(config, attr).replace("source", "target"))

    processing_word = get_processing_word(lowercase=True)

    # Datasets (dev, test, train)
    dev = NERDataset(config.filename_dev, processing_word)
    test = NERDataset(config.filename_test, processing_word)
    train = NERDataset(config.filename_train, processing_word)

    # Word and tag vocabularies
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.filename_glove)

    vocab = vocab_words & vocab_glove
    for special_token in (UNK, NUM):
        vocab.add(special_token)
    vocab_tags.add(UNK)

    # Save the vocabularies
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Trim word vectors against the vocabulary as re-read from disk
    saved_vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(
        saved_vocab,
        config.filename_glove,
        config.filename_trimmed,
        config.dim_word,
    )

    # Char vocabulary from the training file, no processing function
    raw_train = NERDataset(config.filename_train)
    write_vocab(get_char_vocab(raw_train), config.filename_chars)
def main():
    """Build the vocabulary files and the trimmed GloVe matrix.

    Run this before training. It walks train, dev and test, collects the
    words and tags, keeps the words that have a GloVe vector (plus UNK and
    NUM) and writes the vocabularies to disk. The GloVe vectors are then
    exported for the saved word vocabulary, and the char vocabulary is
    built from the training file.
    """
    config = Config(load=False)
    lowercase_word = get_processing_word(lowercase=True)

    # Datasets (dev, test, train)
    dev = CoNLLDataset(config.filename_dev, lowercase_word)
    test = CoNLLDataset(config.filename_test, lowercase_word)
    train = CoNLLDataset(config.filename_train, lowercase_word)

    # Word and tag vocabularies
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.filename_glove)

    # Intersect with GloVe, then add the special tokens.
    vocab = vocab_words & vocab_glove
    for special_token in (UNK, NUM):
        vocab.add(special_token)

    # Save the vocabularies
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Trim GloVe vectors against the vocabulary as re-read from disk
    saved_vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(
        saved_vocab,
        config.filename_glove,
        config.filename_trimmed,
        config.dim_word,
    )

    # Char vocabulary from the training file, no processing function
    raw_train = CoNLLDataset(config.filename_train)
    vocab_chars = get_char_vocab(raw_train)
    write_vocab(vocab_chars, config.filename_chars)
def main():
    """Build the word and tag vocabulary files and trim the word vectors.

    Run this before training. Every word found in train, dev and test is
    kept (the intersection with the pre-trained vocabulary is disabled);
    UNK is placed first and NUM last in the saved word list. No char
    vocabulary is produced here.
    """
    config = Config(parser, load=False)
    processing_word = get_processing_word(lowercase=True)

    # Datasets (dev, test, train)
    dev = Dataset(config.filename_dev, processing_word)
    test = Dataset(config.filename_test, processing_word)
    train = Dataset(config.filename_train, processing_word)

    # Word and tag vocabularies
    vocab_words, vocab_tags = get_vocabs([train, dev, test])

    # NOTE: restricting the words to get_wordvec_vocab(...) is disabled, so
    # the full data vocabulary is saved.
    vocab = [UNK, *vocab_words, NUM]

    # Save the vocabularies
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)
    print('Wrote vocab')

    # Trim word vectors against the vocabulary as re-read from disk
    saved_vocab = load_vocab(config.filename_words)
    export_trimmed_wordvec_vectors(
        saved_vocab,
        config.filename_wordvec,
        config.filename_wordvec_trimmed,
    )
    print('trimmed vocab')
def main():
    """Build the vocabulary files and trimmed fastText vectors.

    Words and tags are collected from train, dev and test. With
    ``config.use_fasttext_oov_vector_gen`` set, every data word is kept;
    otherwise only the words present in the fastText vocabulary are. UNK
    and NUM are added in both cases, vectors are generated for the words
    missing from fastText, and the trimmed vectors are exported.
    """
    config = Config(load=False)
    processing_word = data_utils.get_processing_word(lowercase=True)

    # Datasets (test, dev, train)
    test = Dataset(config.filename_test, processing_word=processing_word)
    dev = Dataset(config.filename_dev, processing_word=processing_word)
    train = Dataset(config.filename_train, processing_word=processing_word)

    # Vocabularies from the data and from fastText
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_fasttext = get_fasttext_vocab(config.filename_fasttext)

    # Word vocabulary. NOTE: in the OOV-generation branch `vocab` is the
    # very same object as `vocab_words`, so the special tokens added below
    # also show up in `vocab_words` (and hence in `oov_words`).
    vocab = (vocab_words
             if config.use_fasttext_oov_vector_gen
             else vocab_words & vocab_fasttext)
    for special_token in (UNK, NUM):
        vocab.add(special_token)

    # Words without a fastText vector get generated ones.
    oov_words = vocab_words - vocab_fasttext
    generate_fasttext_oov_vectors(
        oov_words,
        config.filename_oov_words,
        config.filename_oov_result_vectors,
    )

    # Save the vocabularies
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Trim (and insert new) fastText vectors
    word_to_idx, idx_to_word = load_vocab(config.filename_words)
    export_trimmed_fasttext_vectors(
        word_to_idx,
        idx_to_word,
        config.filename_fasttext,
        config.filename_fasttext_trimmed,
        config.dim_word,
        config.filename_oov_result_vectors,
        config.use_fasttext_oov_vector_gen,
    )

    # Char vocabulary from the training file, no processing function
    raw_train = Dataset(config.filename_train)
    write_vocab(get_char_vocab(raw_train), config.filename_chars)
def main():
    """Build the vocabulary files and trimmed vectors for ontonotes-nw.

    The train/dev/test paths are pointed at ``../datasets/ontonotes-nw``;
    all other paths come from the config unchanged. UNK is added to the tag
    vocabulary as well as to the word vocabulary.
    """
    config = Config(load=False)

    # should be source_x.txt
    # or ontonotes-nw if you like
    config.filename_train = "../datasets/ontonotes-nw/train"
    config.filename_dev = "../datasets/ontonotes-nw/dev"
    config.filename_test = "../datasets/ontonotes-nw/test"

    processing_word = get_processing_word(lowercase=True)

    # Datasets (dev, test, train)
    dev = NERDataset(config.filename_dev, processing_word)
    test = NERDataset(config.filename_test, processing_word)
    train = NERDataset(config.filename_train, processing_word)

    # Word and tag vocabularies
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.filename_glove)

    vocab = vocab_words & vocab_glove
    for special_token in (UNK, NUM):
        vocab.add(special_token)
    vocab_tags.add(UNK)

    # Save the vocabularies
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Trim word vectors against the vocabulary as re-read from disk
    saved_vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(
        saved_vocab,
        config.filename_glove,
        config.filename_trimmed,
        config.dim_word,
    )

    # Char vocabulary from the training file, no processing function
    raw_train = NERDataset(config.filename_train)
    write_vocab(get_char_vocab(raw_train), config.filename_chars)
def main():
    """Build a joint source+target vocabulary, stored under the source dataset.

    Command line: ``<script> SOURCE TARGET`` where both arguments must be
    keys of the module-level ``datasets`` mapping. The word vocabulary is
    the union of the source and target words restricted to the GloVe
    vocabulary, plus UNK and NUM; tags come from the source dataset only.
    The words/tags/chars files and the trimmed GloVe file are written to
    paths derived from ``datasets[SOURCE]``.

    Raises:
        SystemExit: if fewer than two arguments are given or either one is
            not a key of ``datasets``.
    """
    # Validate the command line explicitly and before any work is done:
    # `assert` is stripped under `python -O`, and a missing argument would
    # otherwise surface as a bare IndexError.
    if len(sys.argv) < 3:
        sys.exit("usage: python {} <source> <target>".format(sys.argv[0]))
    source_dataset, target_dataset = sys.argv[1], sys.argv[2]
    if source_dataset not in datasets:
        sys.exit("the source argument should be in {c/o/r/w}")
    if target_dataset not in datasets:
        sys.exit("the target argument should be in {c/o/r/w}")

    config = Config(load=False)

    source_vocab_words, source_vocab_tags = get_vocabs_from_dataset(source_dataset)
    print("Source word vocab size:", len(source_vocab_words))

    target_vocab_words, _ = get_vocabs_from_dataset(target_dataset)
    print("Target word vocab size:", len(target_vocab_words))

    # All outputs live in the source dataset's directory.
    source_dir = datasets[source_dataset]
    config.filename_words = "../datasets/%s/words.txt" % source_dir
    config.filename_tags = "../datasets/%s/tags.txt" % source_dir
    config.filename_chars = "../datasets/%s/chars.txt" % source_dir

    # The union is needed three times below; compute it once.
    all_words = source_vocab_words | target_vocab_words
    print("Source+Target word vocab size:", len(all_words))

    # Word vocabulary: joint words that have a GloVe vector, plus specials
    vocab_glove = get_glove_vocab(config.filename_glove)
    vocab = all_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)
    print("Final word vocab size:", len(vocab))

    # Save vocab
    write_vocab(vocab, config.filename_words)
    write_vocab(source_vocab_tags, config.filename_tags)

    # Char vocabulary is built from the joint word set itself.
    vocab_chars = get_char_vocab(all_words)
    print("Final char vocab size:", len(vocab_chars))
    write_vocab(vocab_chars, config.filename_chars)

    # Trim word vectors against the vocabulary as re-read from disk
    vocab = load_vocab(config.filename_words)
    config.filename_trimmed = config.filename_trimmed.replace(
        "dataset_name", source_dir)
    export_trimmed_glove_vectors(vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)
def main():
    """Build the word and tag vocabulary files and trimmed polyglot vectors.

    Run this before training. Words and tags are collected from train, dev
    and test; the saved word vocabulary is the set of words also present in
    the polyglot vocabulary, plus UNK. The polyglot vectors are then
    exported for the saved vocabulary.
    """
    config = Config(load=False)
    processing_word = get_processing_word()

    # Datasets (dev, test, train)
    dev = getDataset(config.filename_dev, processing_word)
    test = getDataset(config.filename_test, processing_word)
    train = getDataset(config.filename_train, processing_word)

    # Word and tag vocabularies
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_poly = get_polyglot_vocab(config.filename_polyglot)

    # Words common to the data and polyglot, plus the unknown token
    common_vocab = vocab_words & vocab_poly
    common_vocab.add(UNK)

    # Save the vocabularies
    write_vocab(common_vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Trim polyglot vectors against the vocabulary as re-read from disk
    saved_vocab = load_vocab(config.filename_words)
    export_trimmed_polyglot_vectors(
        saved_vocab,
        config.filename_polyglot,
        config.filename_trimmed,
        config.dim,
    )
def main():
    """Build the vocabulary files and the trimmed GloVe matrix.

    Run this before training. A single dataset object spanning the dev,
    train and test files provides the word and tag vocabularies; the saved
    word vocabulary is the intersection with the GloVe vocabulary plus UNK
    and NUM. The char vocabulary comes from the training file alone.
    """
    config = Config(load=False)

    # Word and tag vocabularies from all three files at once
    filenames = [
        config.filename_dev,
        config.filename_train,
        config.filename_test,
    ]
    lowercase_word = processing_word(lowercase=True)
    full_dataset = CoNLLDataset(filenames, lowercase_word)
    vocab_words, vocab_tags = full_dataset.get_word_tag_vocabs()

    vocab_glove = get_glove_vocab(config.filename_glove)
    # `&` binds tighter than `|`: intersect first, then add the specials.
    vocab = (vocab_words & vocab_glove) | {UNK, NUM}

    # Save the vocabularies
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Trim GloVe vectors against the vocabulary as re-read from disk
    saved_vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(
        saved_vocab,
        config.filename_glove,
        config.filename_trimmed,
        config.dim_word,
    )

    # Char vocabulary from the training file, no processing function
    train_only = CoNLLDataset(config.filename_train)
    vocab_chars = train_only.get_char_vocab()
    write_vocab(vocab_chars, config.filename_chars)
def main():
    """Build the word and tag vocabulary files and the trimmed GloVe matrix.

    Walks the dev, test and train files of the dataset, collects words and
    tags and writes them to disk. The word vocabulary is limited to words
    that GloVe knows, plus UNK and NUM. The GloVe vectors of the saved
    words are then exported to a trimmed file.
    """
    config = Config(load=False)
    lowercase_word = get_processing_word(lowercase=True)

    # Datasets for the dev, test and training files
    dev = GloveDataset(config.filename_dev, lowercase_word)
    test = GloveDataset(config.filename_test, lowercase_word)
    train = GloveDataset(config.filename_train, lowercase_word)

    # Word and tag vocabularies
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.filename_glove)

    # Intersection of the dataset vocabulary and GloVe, plus the unknown
    # and numeric tokens
    vocab = vocab_words & vocab_glove
    for special_token in (UNK, NUM):
        vocab.add(special_token)

    # Save the vocabularies
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Export the trimmed GloVe vectors for the vocabulary re-read from disk
    saved_vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(
        saved_vocab,
        config.filename_glove,
        config.filename_trimmed,
        config.dim_word,
    )
def main():
    """Build the vocabulary files and the trimmed GloVe matrix.

    Run this before training. Words and tags are collected from the train,
    dev and both test files (``filename_test1`` and ``filename_test2``).
    The saved word vocabulary is the intersection with the GloVe vocabulary
    plus UNK and NUM. The GloVe vectors are then exported for the saved
    vocabulary, and the char vocabulary is built from the training file.
    """
    # get config and processing of words
    config = Config(load=False)
    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev = FKDataset(config.filename_dev, processing_word)
    test1 = FKDataset(config.filename_test1, processing_word)
    test2 = FKDataset(config.filename_test2, processing_word)
    train = FKDataset(config.filename_train, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test1, test2])
    vocab_glove = get_glove_vocab(config.filename_glove)

    # NOTE: per-task tag lists (tags containing "_dress" / "_jean", each
    # with 'O' appended) used to be built here but were never saved or
    # read. That dead code was removed; the full tag vocabulary is what
    # gets written below.

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Trim GloVe Vectors
    vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)

    # Build and save char vocab (training file, no processing function)
    train = FKDataset(config.filename_train)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.filename_chars)
filename_chars = "working_dir/chars.txt" # build data (just to test model) build_data( filename_dev, filename_test, filename_train, [300], filename_words, filename_words_ext, filename_tags, filename_chars, filename_word="../pretrained_vectors/vecs_{}.txt", filename_word_vec_trimmed="../pretrained_vectors/vecs_{}.trimmed.npz", which_tags=which_tags) vocab_words = load_vocab(filename_words) vocab_tags = load_vocab(filename_tags) vocab_chars = load_vocab(filename_chars) nwords = len(vocab_words) nchars = len(vocab_chars) ntags = len(vocab_tags) # load data processing_word = get_processing_word(vocab_words, vocab_chars, lowercase=True, chars=use_chars) processing_tag = get_processing_word(vocab_tags, lowercase=False, allow_unk=False) X_dev, y_dev = coNLLDataset_full(filename_dev, processing_word,
def main():
    """Build and save the vocabularies and the trimmed embedding vectors.

    You must run this procedure. It is driven by the command line:

        python <script> config [additional vocabulary in conll format ...]

    ``sys.argv[1]`` is the config file handed to ``Config``. Every further
    argument is the path of an extra CoNLL file whose words and tags are
    merged into the vocabularies built from the train, dev and test sets.

    The word vocabulary is written in full (it is not intersected with the
    embedding vocabulary), with the UNK and NUM tokens added. Embedding
    vectors are exported only when ``config.use_pretrained`` is truthy. The
    character vocabulary is built from the training set.

    Raises:
        SystemExit: with status 2 when the config argument is missing.
    """
    if len(sys.argv) < 2:
        sys.stderr.write("Too few arguments have been specified\n")
        sys.stderr.write("python "+sys.argv[0]+" config [additional vocabulary in conll format]\n")
        # A usage error is a failure, so it must not report status 0
        sys.exit(2)

    # get config and processing of words
    config_file = sys.argv[1]
    config = Config(config_file, load=False)
    processing_word = get_processing_word(config)

    # Generators
    dev = CoNLLDataset(config.filename_dev, processing_word)
    test = CoNLLDataset(config.filename_test, processing_word)
    train = CoNLLDataset(config.filename_train, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])

    # Add the words/tags of the additional files the model will be applied to
    for extra_path in sys.argv[2:]:
        extra_words, extra_tags = get_vocabs(
            [CoNLLDataset(extra_path, processing_word)])
        vocab_words |= extra_words
        vocab_tags |= extra_tags

    vocab = vocab_words
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Trim embedding vectors, using the vocabulary as it was written to disk
    vocab = load_vocab(config.filename_words)
    if config.use_pretrained:
        export_trimmed_embedding_vectors(vocab, config.filename_embeddings,
                                         config.filename_embeddings_trimmed,
                                         config.dim_word,
                                         config.embedding_type)

    # Build and save char vocab
    train = CoNLLDataset(config.filename_train)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.filename_chars)
def main():
    """Build the vocabularies, trim GloVe vectors and split the train file.

    You must run this procedure. It takes no arguments; all paths and the
    logger come from a freshly created ``Config(load=False)``.

    The word and tag vocabularies are collected from the train, dev, test
    and sick datasets. Words without a GloVe entry are dropped, UNK and NUM
    are added, and both vocabularies are written to disk. The GloVe vectors
    for the saved word vocabulary are then exported, the character
    vocabulary is built from the training set, and finally ``split_train``
    is called on the config.
    """
    config = Config(load=False)
    processing_word = get_processing_word(lowercase=True)
    logger = config.logger

    # Generators, created in the order dev, test, train, sick
    dataset_paths = (config.filename_dev, config.filename_test,
                     config.filename_train, config.filename_sick)
    dev, test, train, sick = [CoNLLDataset(path, processing_word)
                              for path in dataset_paths]

    # Word and tag vocabularies over every dataset
    word_set, tag_set = get_vocabs([train, dev, test, sick])
    glove_set = get_glove_vocab(config.filename_glove)

    # Keep the words GloVe knows about, then add the special tokens
    kept_words = word_set & glove_set
    for special_token in (UNK, NUM):
        kept_words.add(special_token)

    # Save both vocabularies
    write_vocab(kept_words, config.filename_words)
    write_vocab(tag_set, config.filename_tags)

    # Trim GloVe vectors; load_vocab returns a pair here and only the first
    # element is needed
    saved_vocab, _ = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(saved_vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)

    # Character vocabulary comes from the unprocessed training set
    char_set = get_char_vocab(CoNLLDataset(config.filename_train))
    write_vocab(char_set, config.filename_chars)

    # Split the train file
    split_message = '\n Splitting the train file into {} splits ...'
    logger.info(split_message.format(config.num_splits))
    split_train(config)
    logger.info('Saved the train splits in {}'.format('ner/data/'))
def main():
    """Build and save the vocabularies and the trimmed GloVe vectors.

    You must run this procedure. It is driven by the command line:

        python <script> dev_file test_file train_file run_name

    ``run_name`` selects the output directory (``./results/<run_name>/``)
    and the names of the vocabulary files under ``./data/``. The three
    dataset paths override the corresponding ``config.filename_*`` values,
    and the prediction file name is the test path with ``.txt`` replaced by
    ``.pred``.

    The word vocabulary is the intersection of the dataset words with the
    GloVe vocabulary, plus the UNK, NUM, LG and ENT tokens. The character
    vocabulary is built from the training set.

    Raises:
        SystemExit: with status 2 when fewer than four arguments are given.
    """
    # Fail with a usage message instead of an IndexError further down
    if len(sys.argv) < 5:
        sys.stderr.write("Too few arguments have been specified\n")
        sys.stderr.write("python " + sys.argv[0]
                         + " dev_file test_file train_file run_name\n")
        sys.exit(2)

    dev_path, test_path, train_path, run_name = sys.argv[1:5]

    # get config and processing of words
    dir_output = "./results/" + run_name + "/"
    config = Config(dir_output, load=False)
    processing_word = get_processing_word(lowercase=True)

    # Generators, read from the command-line paths rather than the config
    dev = CoNLLDataset(dev_path, processing_word)
    test = CoNLLDataset(test_path, processing_word)
    train = CoNLLDataset(train_path, processing_word)

    # Point the config at the command-line paths and per-run vocab files
    config.filename_dev = dev_path
    config.filename_test = test_path
    config.filename_train = train_path
    config.filename_pred = test_path.replace(".txt", ".pred")
    config.filename_words = "./data/words_" + run_name + ".txt"
    config.filename_chars = "./data/chars_" + run_name + ".txt"
    config.filename_tags = "./data/tags_" + run_name + ".txt"

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.filename_glove)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)
    vocab.add(LG)
    vocab.add(ENT)

    # Save vocab
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Trim GloVe Vectors, using the vocabulary as it was written to disk
    vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)

    # Build and save char vocab
    train = CoNLLDataset(config.filename_train)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.filename_chars)
def main():
    """Build and save the vocabularies and the trimmed word vectors.

    You must run this procedure. It takes no arguments; every path and
    switch is read from a freshly created ``Config(load=False)``.

    The word and tag vocabularies are collected from the train, dev, test
    and second train datasets. The word vocabulary is intersected with the
    word2vec vocabulary only when ``config.use_pretrained`` equals "w2v"
    exactly. NUM is added when ``config.replace_digits`` is set, and UNK is
    always added.

    Afterwards one trimmed vector file is exported for each of the markers
    "ft", "s2v", "m2v" and "w2v" found in ``config.use_pretrained``. The
    character vocabulary is built from the training set.
    """
    config = Config(load=False)
    processing_word = get_processing_word(lowercase=False)

    # Generators
    dev = CoNLLDataset(config.filename_dev, processing_word)
    test = CoNLLDataset(config.filename_test, processing_word)
    train = CoNLLDataset(config.filename_train, processing_word)
    train2 = CoNLLDataset(config.filename_train2, processing_word)

    # Word and tag vocabularies over every dataset
    vocab_words, vocab_tags = get_vocabs([train, dev, test, train2])

    # The word2vec vocabulary is read whenever "w2v" is requested, but the
    # intersection is applied only when it is the sole pretrained source.
    if "w2v" in config.use_pretrained:
        w2v_words = get_word_vec_vocab(config.filename_word2vec)
    if config.use_pretrained == "w2v":
        vocab = vocab_words & w2v_words
    else:
        vocab = vocab_words

    if config.replace_digits:
        vocab.add(NUM)
    vocab.add(UNK)

    # Save both vocabularies
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # FastText: run the configured shell command, then trim its vectors
    if "ft" in config.use_pretrained:
        words_path = os.path.abspath(config.filename_words)
        vectors_path = os.path.abspath(config.filename_fasttext)
        subprocess.check_call(
            config.get_ft_vectors_cmd.format(words_path, vectors_path),
            shell=True)
        export_trimmed_word_vectors(load_vocab(config.filename_words),
                                    config.filename_fasttext,
                                    config.filename_trimmed_ft,
                                    config.dim_word)

    # Sent2Vec: run the configured shell command, then trim its vectors
    if "s2v" in config.use_pretrained:
        words_path = os.path.abspath(config.filename_words)
        # NOTE(review): the command is given filename_fasttext while the
        # export below reads filename_sent2vec -- confirm this is intended.
        vectors_path = os.path.abspath(config.filename_fasttext)
        subprocess.check_call(
            config.get_sent2vec_vectors_cmd.format(words_path, vectors_path),
            shell=True)
        export_trimmed_word_vectors(load_vocab(config.filename_words),
                                    config.filename_sent2vec,
                                    config.filename_trimmed_s2v,
                                    config.dim_sent)

    # Morph2Vec: trimmed with partial matching enabled
    if "m2v" in config.use_pretrained:
        export_trimmed_word_vectors(load_vocab(config.filename_words),
                                    config.filename_morph2vec,
                                    config.filename_trimmed_m2v,
                                    config.dim_morph,
                                    partial_match=True)

    # word2vec
    if "w2v" in config.use_pretrained:
        export_trimmed_word_vectors(load_vocab(config.filename_words),
                                    config.filename_word2vec,
                                    config.filename_trimmed_w2v,
                                    config.dim_word)

    # Character vocabulary comes from the unprocessed training set
    char_source = CoNLLDataset(config.filename_train)
    vocab_chars = get_char_vocab(char_source, config.use_ortho_char)
    write_vocab(vocab_chars, config.filename_chars)
def generate_model_data(data_prefix=None):
    """Build and save the vocabularies and the trimmed GloVe vectors.

    You must run this procedure. Paths are read from a freshly created
    ``Config(load=False)``; progress is reported with ``print``.

    Args:
        data_prefix: optional prefix. When truthy, the dev, test and train
            file names are replaced by ``<cwd>/data/<prefix>_<basename>``,
            where ``<basename>`` is the base name of the configured file.

    If the dev file does not exist, ``preprocess_data`` is called first to
    generate the input files. The word vocabulary is the intersection of
    the dataset words with the GloVe vocabulary, plus UNK and NUM. The
    character vocabulary is built from the training set.
    """
    config = Config(load=False)
    print('Config')
    processing_word = get_processing_word(lowercase=True)
    print('Processing_word')

    # Redirect the three dataset paths to the prefixed files under ./data
    if data_prefix:
        cwd = os.getcwd()
        for attr in ('filename_dev', 'filename_test', 'filename_train'):
            prefixed_name = (data_prefix + '_'
                             + os.path.basename(getattr(config, attr)))
            setattr(config, attr, os.path.join(cwd, 'data', prefixed_name))

    # Generate the input files on demand
    if not os.path.isfile(config.filename_dev):
        print('Preprocessing tokens and labels to generate input data files')
        preprocess_data()

    # Generators
    dev = CoNLLDataset(config.filename_dev, processing_word)
    test = CoNLLDataset(config.filename_test, processing_word)
    train = CoNLLDataset(config.filename_train, processing_word)
    print('Loaded dev, test, train')

    # Word and tag vocabularies over every dataset
    word_set, tag_set = get_vocabs([train, dev, test])
    print('Loading vocab_words')
    glove_set = get_glove_vocab(config.filename_glove)

    # Keep the words GloVe knows about, then add the special tokens
    kept_words = word_set & glove_set
    for special_token in (UNK, NUM):
        kept_words.add(special_token)

    # Save both vocabularies
    write_vocab(kept_words, config.filename_words)
    write_vocab(tag_set, config.filename_tags)

    # Trim GloVe vectors, using the vocabulary as it was written to disk
    saved_vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(saved_vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)

    # Character vocabulary comes from the unprocessed training set
    char_set = get_char_vocab(CoNLLDataset(config.filename_train))
    write_vocab(char_set, config.filename_chars)
def main():
    """Build and save the vocabularies and the trimmed GloVe vectors.

    You must run this procedure. It is configured from the command line:

        --dataset        directory holding train.txt, dev.txt and test.txt;
                         all outputs are written there (default conll2003)
        --train_lang / --dev_lang / --test_lang
                         language passed to each dataset (default en)
        --src_glove      GloVe file for the training language
        --tgt_glove      optional GloVe file for the test language
        --emb_dim        embedding dimension (default 300)
        --trimmed_glove  output file name for the trimmed vectors

    The word vocabulary is the dataset words intersected with the source
    GloVe vocabulary, or with the union of source and target vocabularies
    when ``--tgt_glove`` is given, plus UNK and NUM. The character
    vocabulary is built from the train, test and dev sets.
    """
    parser = argparse.ArgumentParser()
    option_table = (
        ('--dataset', str, 'conll2003'),
        ('--train_lang', str, 'en'),
        ('--dev_lang', str, 'en'),
        ('--test_lang', str, 'en'),
        ('--src_glove', str, 'data/glove.42B.300d.txt'),
        ('--tgt_glove', str, None),
        ('--emb_dim', int, 300),
        ('--trimmed_glove', str, 'glove_trimmed.npz'),
    )
    for flag, value_type, default_value in option_table:
        parser.add_argument(flag, type=value_type, default=default_value)
    args = parser.parse_args()

    processing_word = get_processing_word(lowercase=True)
    data_dir = args.dataset

    # Generators
    dev = CoNLLDataset(os.path.join(data_dir, 'dev.txt'),
                       processing_word=processing_word, lang=args.dev_lang)
    test = CoNLLDataset(os.path.join(data_dir, 'test.txt'),
                        processing_word=processing_word, lang=args.test_lang)
    train = CoNLLDataset(os.path.join(data_dir, 'train.txt'),
                         processing_word=processing_word,
                         lang=args.train_lang)

    # Word and tag vocabularies over every dataset
    vocab_words, vocab_tags = get_vocabs([train, dev, test])

    # Restrict the words to those covered by the available GloVe files and
    # record which file serves which language.
    source_words = get_glove_vocab(args.src_glove, lang=args.train_lang)
    gloves = {args.train_lang: args.src_glove}
    if args.tgt_glove:
        target_words = get_glove_vocab(args.tgt_glove, lang=args.test_lang)
        vocab = vocab_words & (source_words | target_words)
        gloves[args.test_lang] = args.tgt_glove
    else:
        vocab = vocab_words & source_words
    for special_token in (UNK, NUM):
        vocab.add(special_token)

    # Save both vocabularies
    words_path = os.path.join(data_dir, 'words.txt')
    write_vocab(vocab, words_path)
    write_vocab(vocab_tags, os.path.join(data_dir, 'tags.txt'))

    # Trim GloVe vectors, using the vocabulary as it was written to disk
    vocab = load_vocab(words_path)
    export_trimmed_glove_vectors_multiple(
        vocab, gloves, os.path.join(data_dir, args.trimmed_glove),
        args.emb_dim)

    # Character vocabulary over the unprocessed train, test and dev sets
    train = CoNLLDataset(os.path.join(data_dir, 'train.txt'))
    test = CoNLLDataset(os.path.join(data_dir, 'test.txt'))
    dev = CoNLLDataset(os.path.join(data_dir, 'dev.txt'))
    vocab_chars = get_char_vocab([train, test, dev])
    write_vocab(vocab_chars, os.path.join(data_dir, 'chars.txt'))