def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_mor_tags = load_vocab(config.mor_tags_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    vocab_lex_tags = load_vocab(config.lex_tags_filename)

    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          lowercase=True, chars=config.chars)
    processing_mor_tag = get_processing_word(vocab_mor_tags, lowercase=False)
    processing_tag = get_processing_word(vocab_tags, lowercase=False)
    processing_lex_tag = get_processing_word(vocab_lex_tags, lowercase=False)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    cnn_model = CnnLstmCrfModel(config, embeddings, ntags=len(vocab_tags),
                                nchars=len(vocab_chars))
    cnn_model.build()
    cnn_model.write_tag_result_test(vocab_tags, processing_word,
                                    processing_mor_tag, processing_lex_tag)
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_mor_tags = load_vocab(config.mor_tags_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    vocab_lex_tags = load_vocab(config.lex_tags_filename)

    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          lowercase=True, chars=config.chars)
    processing_mor_tag = get_processing_word(vocab_mor_tags, lowercase=False)
    processing_tag = get_processing_word(vocab_tags, lowercase=False)
    processing_lex_tag = get_processing_word(vocab_lex_tags, lowercase=False)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create datasets
    dev = Data(config.dev_filename, processing_word, processing_mor_tag,
               processing_lex_tag, processing_tag, config.max_iter)
    test = Data(config.test_filename, processing_word, processing_mor_tag,
                processing_lex_tag, processing_tag, config.max_iter)
    train = Data(config.train_filename, processing_word, processing_mor_tag,
                 processing_lex_tag, processing_tag, config.max_iter)

    cnn_model = CnnLstmCrfModel(config, embeddings, ntags=len(vocab_tags),
                                nchars=len(vocab_chars))
    cnn_model.build()
    cnn_model.train(train, dev, vocab_tags)
    cnn_model.evaluate(test, vocab_tags)
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)

    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          lowercase=True, chars=config.chars)
    processing_tag = get_processing_word(vocab_tags, lowercase=False,
                                         allow_unk=False)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create datasets
    dev = CoNLLDataset(config.dev_filename, processing_word,
                       processing_tag, config.max_iter)
    test = CoNLLDataset(config.test_filename, processing_word,
                        processing_tag, config.max_iter)
    train = CoNLLDataset(config.train_filename, processing_word,
                         processing_tag, config.max_iter)

    # build model
    model = NERModel(config, embeddings, ntags=len(vocab_tags),
                     nchars=len(vocab_chars))
    model.build()

    # train, evaluate and interact
    model.train(train, dev, vocab_tags)
    model.evaluate(test, vocab_tags)
    model.interactive_shell(vocab_tags, processing_word)
def main(config):
    # load vocabs
    vocab_words, idx2words = load_vocab(config.words_filename)
    vocab_tags, _ = load_vocab(config.tags_filename)
    vocab_chars, _ = load_vocab(config.chars_filename)
    vocab_pos, _ = load_vocab(config.pos_filename)

    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          lowercase=True, chars=config.chars)
    processing_tag = get_processing_word(vocab_tags, lowercase=False)
    processing_pos = get_processing_word(vocab_pos, lowercase=False)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)
    embeddings_uni = get_trimmed_glove_vectors(config.uni_trimmed_filename)
    pos_embeddings = get_trimmed_glove_vectors(config.feature_trimmed_filename)
    NE_dic = get_trimmed_glove_vectors(config.trimmed_dic)

    # create datasets
    dev = CoNLLDataset(config.dev_filename, processing_word,
                       processing_tag, processing_pos, config.max_iter)
    train = CoNLLDataset(config.train_filename, processing_word,
                         processing_tag, processing_pos, config.max_iter)

    # build model
    model = NERModel(config, embeddings, embeddings_uni, pos_embeddings,
                     ntags=len(vocab_tags), nchars=len(vocab_chars),
                     vocab_words=idx2words, NE_dic=NE_dic)
    model.build()

    # train, evaluate or predict (`state`, `file`, `convert`, `t2o` and
    # `tagging` are defined elsewhere in this script)
    if state == "train":
        model.train(train, dev, vocab_tags)
    elif state == "evaluate":
        model.evaluate(dev, vocab_tags)
    else:  # state == "predict"
        convert(file)
        t2o("data_format/test_convert.txt", "data_format/test.txt")
        test = CoNLLDataset(config.test_filename, processing_word,
                            processing_tag, processing_pos, config.max_iter)
        model.evaluate(test, vocab_tags)
        tagging("data_format/test_convert.txt")
def build_data(config): """ Procedure to build data Args: config: defines attributes needed in the function Returns: creates vocab files from the datasets creates a npz embedding file from trimmed glove vectors """ processing_word = get_processing_word(lowercase=True) processing_word = get_processing_word(lowercase=True) # clean data train_filepath, dev_filepath_a = write_clear_data( config.train_filename, build_dev=config.build_dev_from_trainset, dev_ratio=config.dev_ratio) test_filepath, dev_filepath_b = write_clear_data( config.test_filename, build_dev=config.build_dev_from_testset, dev_ratio=config.dev_ratio) dev_filepath = dev_filepath_a or dev_filepath_b # Generators dev = Dataset(dev_filepath, processing_word) test = Dataset(test_filepath, processing_word) train = Dataset(train_filepath, processing_word) # Build Word and Tag vocab vocab_words, vocab_tags = get_vocabs([train, dev, test]) vocab_glove = get_glove_vocab(config.glove_filename) vocab = vocab_words & vocab_glove vocab.add(UNK) vocab.add(NUM) # Save vocab write_vocab(vocab, config.words_filename) write_vocab(vocab_tags, config.tags_filename) # Trim GloVe Vectors vocab = load_vocab(config.words_filename) export_trimmed_glove_vectors(vocab, config.glove_filename, config.trimmed_filename, config.dim) # Build and save char vocab train = Dataset(train_filepath) vocab_chars = get_char_vocab(train) write_vocab(vocab_chars, config.chars_filename)
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_chars = load_vocab(config.chars_filename)

    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          lowercase=True, chars=True)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create datasets
    dev = AnnotationDataset(config.dev_filename, processing_word)
    test = AnnotationDataset(config.test_filename, processing_word)
    train = AnnotationDataset(config.train_filename, processing_word)

    print("Num. train: %d" % len(train))
    print("Num. test: %d" % len(test))
    print("Num. dev: %d" % len(dev))

    # build WImpModel
    model = WImpModel(config, embeddings, ntags=config.nclass,
                      nchars=len(vocab_chars))
    model.build_graph()

    # train, evaluate and interact
    model.train(train, dev)
    model.evaluate(test)
def build_data(config): """ Procedure to build data Args: config: defines attributes needed in the function Returns: creates vocab files from the datasets creates a npz embedding file from trimmed glove vectors """ processing_word = get_processing_word(lowercase=config.lowercase) # Generators dev = CoNLLDataset(config.dev_filename, processing_word) test = CoNLLDataset(config.test_filename, processing_word) train = CoNLLDataset(config.train_filename, processing_word) # Build Word and Tag vocab vocab_words, vocab_tags = get_vocabs([train, dev, test]) vocab_glove = get_glove_vocab(config.glove_filename) vocab = vocab_words & vocab_glove vocab.add(UNK) vocab.add(NUM) # Save vocab write_vocab(vocab, config.words_filename) write_vocab(vocab_tags, config.tags_filename) # Trim GloVe Vectors vocab = load_vocab(config.words_filename) export_trimmed_glove_vectors(vocab, config.glove_filename, config.trimmed_filename, config.dim)
def rec(sentence):
    try:
        processing_word = get_processing_word(nlu.vocab_words,
                                              lowercase=config.lowercase)
        words_raw = character_separation(sentence)[0].split(' ')
        # Python 2: decode raw byte strings to unicode
        words_raw = [unicode(word, 'utf-8') for word in words_raw]
        words = list(map(processing_word, words_raw))
        pred_ids, _ = nlu.model.predict_batch(nlu.sess, [words])
        preds = [nlu.idx_to_tag[idx] for idx in list(pred_ids[0])]
        print_sentence(nlu.model.logger, {"x": words_raw, "y": preds})
        return preds
    except EOFError:
        print("Closing session.")

# Example: nlu.rec('请播放电视剧三生三世十里桃花')
def build_data(config): """ Procedure to build data Args: config: defines attributes needed in the function Returns: creates vocab files from the datasets creates a npz embedding file from trimmed glove vectors """ processing_word = get_processing_word(lowercase=True) # Generators dev = CoNLLDataset(config.dev_filename, processing_word) #test = CoNLLDataset(config.test_filename, processing_word) train = CoNLLDataset(config.train_filename, processing_word) # Build Word and Tag vocab vocab_words, vocab_tags, vocab_pos = get_vocabs([train, dev]) vocab_glove = get_glove_vocab(config.glove_filename) vocab_glove_uni = get_glove_vocab(config.glove_uni_filename) vocab_feature = get_pos_glove_vocab(config.glove_filename) # vocab = vocab_words & vocab_glove vocab = vocab_glove | vocab_words vocab.add(UNK) vocab.add(NUM) vocab_pos = vocab_feature vocab_pos.add(UNK) vocab_pos.add(NUM) # Save vocab write_vocab(vocab, config.words_filename) write_vocab(vocab_glove_uni, config.uni_words_filename) write_vocab(vocab_tags, config.tags_filename) write_vocab(vocab_pos, config.pos_filename) # Trim GloVe Vectors vocab = load_vocab(config.words_filename) export_trimmed_glove_vectors(vocab, config.glove_filename, config.trimmed_filename, config.t_dim) vocab = load_vocab(config.uni_words_filename) export_trimmed_uni_vectors(vocab, config.NEdic_filename, config.trimmed_dic, config.dic_dim) export_trimmed_uni_vectors(vocab, config.glove_uni_filename, config.uni_trimmed_filename, config.dim) vocab_feature = load_vocab(config.pos_filename) export_trimmed_pos_vectors(vocab_feature, config.glove_feature, config.feature_trimmed_filename, config.pos_dim) # Build and save char vocab train = CoNLLDataset(config.train_filename) vocab_chars = get_char_vocab(train) write_vocab(vocab_chars, config.chars_filename)
def build_joint_vocab(config):
    # Common options for all datasets
    processing_word = get_processing_word(lowercase=True)
    vocab_glove = get_glove_vocab(config.filename_glove)

    # Compute and save individual vocab
    v1_words, v1_chars = get_conll2005_vocab(config.conll2005,
                                             processing_word, vocab_glove)
    v2_words, v2_chars = get_conll2003_vocab(config.conll2003,
                                             processing_word, vocab_glove)
    v3_words, v3_chars = get_semcor_vocab(config.semcor,
                                          processing_word, vocab_glove)

    print(" *** Joint vocabulary ***")
    vocab_words = v1_words.union(v2_words, v3_words)
    vocab_chars = v1_chars.union(v2_chars, v3_chars)

    # Save combined vocab
    write_vocab(vocab_words, config.filename_words)
    write_vocab(vocab_chars, config.filename_chars)

    # Trim GloVe Vectors
    vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)
def load(self):
    self.vocab_tags = load_vocab(self.filename_tags)
    self.processing_tag = get_processing_word(self.vocab_tags,
                                              lowercase=False,
                                              allow_unk=False)
    self.ntags = len(self.vocab_tags)
    self.early_stop_metric_sign = -1 if self.stop_direction == 'increase' else 1
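# Most of the examples here lean on the same small helpers. For reference,
# load_vocab is commonly implemented as below -- a minimal sketch following
# the sequence_tagging-style data_utils; the exact code per project may differ:
def load_vocab(filename):
    """Map each word to its line number, so ids match the written vocab file."""
    d = dict()
    with open(filename) as f:
        for idx, word in enumerate(f):
            d[word.strip()] = idx
    return d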
def build_data(config):
    processing_word = get_processing_word()

    dev = CoNLLDataset(config.dev_filename, processing_word)
    test = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    vocab_words, vocab_tags, vocab_poss = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)
    write_vocab(vocab_poss, config.poss_filename)

    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    train = CoNLLDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    # prefix and suffix vocabs
    vocab_pref_suff = load_vocab(config.PS_filename)
    vocab_pref_suff_2 = load_vocab(config.PS_filename_2)
    vocab_pref_suff_4 = load_vocab(config.PS_filename_4)

    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          vocab_pref_suff, vocab_pref_suff_2,
                                          vocab_pref_suff_4, lowercase=True,
                                          chars=config.chars,
                                          Pref_Suff=config.pref_suff)
    processing_tag = get_processing_word(vocab_tags, lowercase=False,
                                         Geoparser=True)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # Create datasets. dev, test and train hold the raw words and tags, which
    # processing_word and processing_tag map to word and tag indices; these
    # indices are what model.evaluate below consumes via run_evaluate/run_epoch.
    dev = CoNLLDataset(config.dev_filename, processing_word,
                       processing_tag, config.max_iter)
    test = CoNLLDataset(config.test_filename, processing_word,
                        processing_tag, config.max_iter)
    train = CoNLLDataset(config.train_filename, processing_word,
                         processing_tag, config.max_iter)

    # build model
    model = NERModel(config, embeddings, ntags=len(vocab_tags),
                     nchars=len(vocab_chars))
    model.build()

    # train, evaluate and interact
    model.train(train, dev, vocab_tags)
    model.evaluate(test, vocab_tags)
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    vocab_iob = {"O": 0, "B": 1, "I": 2}
    vocab_type = {"LOC": 0, "PER": 1, "ORG": 2, "MISC": 3}

    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          lowercase=True, chars=config.chars)
    processing_tag = get_processing_word(vocab_tags, lowercase=False)
    processing_iob = get_processing_word(vocab_iob, lowercase=False)
    processing_type = get_processing_word(vocab_type, lowercase=False)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create datasets
    dev = CoNLLDataset(config.dev_filename, processing_word, processing_tag,
                       processing_iob, processing_type, config.max_iter,
                       config.chars)
    test = CoNLLDataset(config.test_filename, processing_word, processing_tag,
                        processing_iob, processing_type, config.max_iter,
                        config.chars)
    train = CoNLLDataset(config.train_filename, processing_word,
                         processing_tag, processing_iob, processing_type,
                         config.max_iter, config.chars)

    model = NERModel(config, embeddings, ntags=len(vocab_tags),
                     nchars=len(vocab_chars), niob=3, ntype=4)
    model.build()

    # train, evaluate and time it (requires `import time` at module level)
    print(vocab_tags)
    model.train(train, dev, vocab_tags)
    stime = time.time()
    model.evaluate(test, vocab_tags)
    print(time.time() - stime)
def build_data(config, logger): """ Procedure to build data """ processing_word = get_processing_word(lowercase=config.lowercase) # Generators test = CoNLLDataset(config.test_filename, processing_word) dev = CoNLLDataset(config.dev_filename, processing_word) train = CoNLLDataset(config.train_filename, processing_word) # Build Word and Tag vocab print("Build Word and Tag vocab...") vocab_words, vocab_poss, vocab_chunks, \ vocab_aspect_tags, vocab_polarity_tags, vocab_joint_tags = get_vocabs([train, dev, test]) vocab = vocab_words vocab.add(UNK) vocab.add(NUM) # Save vocab print("Dealing words vocab...") write_vocab(vocab, config.words_filename) print("Dealing poss vocab...") write_vocab(vocab_poss, config.poss_filename) vocab_chunks = [tags for tags in vocab_chunks] if "NO" in vocab_chunks: vocab_chunks.remove("NO") vocab_chunks.insert(0, "NO") else: logger.error(">>> vocab_chunks used as mpqa has something wrong!") print("Dealing chunks vocab...") write_vocab(vocab_chunks, config.chunks_filename) vocab_aspect_tags = [tags for tags in vocab_aspect_tags] vocab_aspect_tags.remove("O") vocab_aspect_tags.insert(0, "O") vocab_polarity_tags = [tags for tags in vocab_polarity_tags] vocab_polarity_tags.remove("O") vocab_polarity_tags.insert(0, "O") vocab_joint_tags = [tags for tags in vocab_joint_tags] vocab_joint_tags.remove("O") vocab_joint_tags.insert(0, "O") print("Dealing aspect_tags vocab...") write_vocab(vocab_aspect_tags, config.aspect_tags_filename) print("Dealing polarity_tags vocab...") write_vocab(vocab_polarity_tags, config.polarity_tags_filename) print("Dealing joint_tags vocab...") write_vocab(vocab_joint_tags, config.joint_tags_filename) vocab = load_vocab(config.words_filename) export_trimmed_glove_vectors(vocab, config.domain_filename, config.domain_trimmed_filename, config.dim_domain) export_trimmed_glove_vectors(vocab, config.general_filename, config.general_trimmed_filename, config.dim_general)
def test_processing_words_with_words_idx_dict_and_allow_unknow():
    d = dict()
    d['娃哈哈'] = 1
    d['#####'] = 3
    d['<UNK>'] = 0
    processing_word = get_processing_word(d, True)
    word1 = processing_word("娃哈哈")
    word2 = processing_word("12345")
    word3 = processing_word("xixihehe")
    print(word1, word2, word3)
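# For context: get_processing_word returns a closure that maps a word (and
# optionally its characters) to vocabulary ids. A minimal sketch, following
# the widely-copied sequence_tagging data_utils; signatures in the examples
# here vary (extra morph/POS/prefix-suffix vocabs), and the special tokens
# below are the common defaults, not necessarily each project's:
UNK = "$UNK$"
NUM = "$NUM$"

def get_processing_word(vocab_words=None, vocab_chars=None,
                        lowercase=False, chars=False, allow_unk=True):
    def f(word):
        # 0. get char ids, ignoring characters out of vocabulary
        if vocab_chars is not None and chars:
            char_ids = [vocab_chars[c] for c in word if c in vocab_chars]
        # 1. normalize the word
        if lowercase:
            word = word.lower()
        if word.isdigit():
            word = NUM
        # 2. map word to id, falling back to UNK if allowed
        if vocab_words is not None:
            if word in vocab_words:
                word = vocab_words[word]
            elif allow_unk:
                word = vocab_words[UNK]
            else:
                raise Exception("Unknown word not allowed; check your vocab")
        # 3. return (char ids, word id) or just the word id
        if vocab_chars is not None and chars:
            return char_ids, word
        return word
    return f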
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    dictionary = load_vocab("data/types.txt")
    types_dic = collections.OrderedDict([(v, k) for k, v in dictionary.items()])
    vocab_iob = {"O": 0, "B": 1, "I": 2}
    vocab_type = load_vocab(config.types_filename)
    print(vocab_type)

    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          lowercase=True, chars=config.chars)
    processing_tag = get_processing_word(vocab_tags, lowercase=False)
    processing_iob = get_processing_word(vocab_iob, lowercase=False)
    processing_type = get_processing_word(vocab_type, lowercase=False)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create datasets
    dev = CoNLLDataset(config.dev_filename, processing_word, processing_tag,
                       processing_iob, processing_type, config.max_iter,
                       config.chars)
    test = CoNLLDataset(config.test_filename, processing_word, processing_tag,
                        processing_iob, processing_type, config.max_iter,
                        config.chars)
    train = CoNLLDataset(config.train_filename, processing_word,
                         processing_tag, processing_iob, processing_type,
                         config.max_iter, config.chars)

    ntype = len(vocab_type)
    model = POSmodel(config, embeddings, ntags=len(vocab_tags),
                     nchars=len(vocab_chars), niob=3, ntype=ntype)
    model.build()

    model.train(train, dev, vocab_type)
    model.evaluate(test, vocab_type)
def load(self):
    # load vocab dictionaries
    self.vocab_words = load_dict(self.f_words)
    self.vocab_tags = load_dict(self.f_tags)
    self.vocab_chars = load_dict(self.f_chars)
    self.num_word = len(self.vocab_words)
    self.num_tag = len(self.vocab_tags)
    self.num_char = len(self.vocab_chars)

    # processing functions that map string -> id
    self.processing_word = get_processing_word(self.vocab_words,
                                               self.vocab_chars,
                                               lowercase=True,
                                               chars=self.use_chars)
    self.processing_tag = get_processing_word(self.vocab_tags,
                                              lowercase=False,
                                              allow_unk=False)

    # pre-trained embeddings
    self.embeddings = (processing_trimmed_glove_vector(self.f_trimmed)
                       if self.use_pretrained else None)
def build_data(config, logger): """ Procedure to build data """ # Generators processing_word = get_processing_word(lowercase=config.lowercase) test = CoNLLDataset(config.test_filename, processing_word) dev = CoNLLDataset(config.dev_filename, processing_word) train = CoNLLDataset(config.train_filename, processing_word) # Build Word and Tag vocab logger.info("Build Word and Tag vocab...") vocab_words, vocab_poss, vocab_chunks, vocab_tags = get_vocabs( [train, dev, test]) vocab = vocab_words vocab.add(UNK) vocab.add(NUM) # Save vocab write_vocab(vocab, config.words_filename) vocab_tags = [tags for tags in vocab_tags] vocab_tags.remove("O") vocab_tags.insert(0, "O") write_vocab(vocab_tags, config.tags_filename) # Trim GloVe Vectors vocab = load_vocab(config.words_filename) export_trimmed_glove_vectors(vocab, config.glove_filename, config.trimmed_filename, config.dim) # Build and save char vocab logger.info("Build chars vocab...") train = CoNLLDataset(config.train_filename) vocab_chars = get_char_vocab(train) write_vocab(vocab_chars, config.chars_filename) # Build and save Depstree processing_relation = get_processing_relation() dev_deps = DepsDataset(config.dev_deps_filename, processing_word, processing_relation) train_deps = DepsDataset(config.train_deps_filename, processing_word, processing_relation) logger.info("Build relations vocab...") vocab_relations = get_relations_vocabs([train_deps, dev_deps]) vocab_relations.add(UNK) write_vocab(vocab_relations, config.relations_filename)
def build_data(config): """ Procedure to build data Args: config: defines attributes needed in the function Returns: creates vocab files from the datasets creates a npz embedding file from trimmed glove vectors """ processing_word = get_processing_word(lowercase=True) # Generators dev = CoNLLDataset(config.dev_filename, processing_word) test = CoNLLDataset(config.test_filename, processing_word) train = CoNLLDataset(config.train_filename, processing_word) # Build Word and Tag vocab vocab_words, vocab_tags = get_vocabs([train, dev, test]) vocab_glove = get_glove_vocab(config.glove_filename) vocab = vocab_words & vocab_glove vocab.add(UNK) vocab.add(NUM) vocab.add(PAD) # Save vocab write_vocab(vocab, config.words_filename) write_vocab(vocab_tags, config.tags_filename) # Trim GloVe Vectors vocab = load_vocab(config.words_filename) export_trimmed_glove_vectors(vocab, config.glove_filename, config.trimmed_filename, config.dim) # Build and save char vocab train = CoNLLDataset(config.train_filename, processing_word) vocab_chars = get_char_vocab(train) write_vocab(vocab_chars, config.chars_filename) # Build and save type vocab vocab_types = set() print len(vocab_tags) for tag in vocab_tags: if tag != 'O': vocab_types.add(tag[2:]) write_vocab(vocab_types, config.types_filename)
def test_dataset():
    # test getDataset and get_vocabs
    processing_word = get_processing_word()
    dev = getDataset("../data/test_ner.txt", processing_word)
    vocab_words, vocab_tags = get_vocabs([dev])

    # get common vocab from dev file and polyglot
    vocab_poly = get_polyglot_vocab("../data/polyglot-zh.pkl")
    vocab = vocab_words & vocab_poly
    vocab.add(UNK)
    write_vocab(vocab, "../data/words.txt")
    write_vocab(vocab_tags, "../data/tags.txt")

    vocab = load_vocab("../data/words.txt")
    export_trimmed_polyglot_vectors(vocab, "../polyglot-zh.pkl",
                                    "../data/polyglot.trimmed.npz", 64)
    data = get_trimmed_polyglot_vectors("../data/polyglot.trimmed.npz")
def load(self): """Loads vocabulary, processing functions and embeddings """ # 1. vocabulary self.vocab_words = load_vocab(self.filename_words) self.vocab_chars = load_vocab(self.filename_chars) self.nwords = len(self.vocab_words) self.nchars = len(self.vocab_chars) # 2. get processing functions that map str -> id self.processing_word = get_processing_word(self.vocab_words, self.vocab_chars, lowercase=True, chars=self.use_chars) # 3. get pre-trained embeddings self.embeddings = (get_trimmed_glove_vectors(self.filename_trimmed) if self.use_pretrained else None)
def build_data(config): """ Procedure to build data Args: config: defines attributes needed in the function Returns: creates vocab files from the datasets creates a npz embedding file from trimmed glove vectors """ processing_word = get_processing_word(lowercase=config.lowercase) # Generators dev = CoNLLDataset(config.dev_filename, processing_word) test = CoNLLDataset(config.test_filename, processing_word) train = CoNLLDataset(config.train_filename, processing_word) # Build Word and Tag vocab vocab_words, vocab_tags = get_vocabs([train, dev, test]) vocab_glove = get_glove_vocab(config.glove_filename) vocab = vocab_words & vocab_glove vocab.add(UNK) vocab.add(NUM) # Save vocab write_vocab(vocab, config.words_filename) write_vocab(vocab_tags, config.tags_filename) # Trim GloVe Vectors vocab = load_vocab(config.words_filename) export_trimmed_glove_vectors(vocab, config.glove_filename, config.trimmed_filename, config.dim) # Build and save char vocab train = CoNLLDataset(config.train_filename) vocab_chars = get_char_vocab(train) write_vocab(vocab_chars, config.chars_filename)
def main(pretrained_embeddings_file=None,
         filtered_embeddings_file="data/filtered_embeddings.txt"):
    words_file = "data/words.txt"
    tags_file = "data/tags.txt"
    chars_file = "data/chars.txt"
    test_file = 'data/eng.testa'
    train_file = 'data/eng.train'

    processing_word = get_processing_word(lowercase=False)
    test = CoNLLDataset(test_file, processing_word)
    train = CoNLLDataset(train_file, processing_word)

    vocab_words, vocab_tags = get_vocabs([train, test])
    vocab = set(vocab_words)
    if pretrained_embeddings_file:
        embedding_vocab = get_embedding_vocab(pretrained_embeddings_file)
        vocab &= embedding_vocab
        print('{} overlapping words'.format(len(vocab)))

    vocab.add(UNK)
    vocab.add(NUM)
    vocab = list(vocab)
    # TODO: there's probably no need for these anymore, check and remove,
    # if this is the case
    vocab.insert(TOKEN2IDX[PAD], PAD)
    vocab.insert(TOKEN2IDX[START_TAG], START_TAG)
    vocab.insert(TOKEN2IDX[STOP_TAG], STOP_TAG)
    print(len(vocab))
    write_vocab(vocab, words_file)
    write_vocab(vocab_tags, tags_file)

    if pretrained_embeddings_file:
        filter_embeddings_in_vocabulary(words_file,
                                        pretrained_embeddings_file,
                                        filtered_embeddings_file)

    vocab_chars = get_char_vocab(vocab_words)
    write_vocab(vocab_chars, chars_file)
def main(): """Procedure to build data You MUST RUN this procedure. It iterates over the whole dataset (train, dev and test) and extract the vocabularies in terms of words, tags, and characters. Having built the vocabularies it writes them in a file. The writing of vocabulary in a file assigns an id (the line #) to each word. It then extract the relevant GloVe vectors and stores them in a np array such that the i-th entry corresponds to the i-th word in the vocabulary. Args: config: (instance of Config) has attributes like hyper-params... """ # get config and processing of words config = Config(load=False) processing_word = get_processing_word() # Generators dev = CoNLLDataset(config.filename_dev, processing_word) test = CoNLLDataset(config.filename_test, processing_word) train = CoNLLDataset(config.filename_train, processing_word)
# load vocabs
vocab_words = load_vocab(config.words_filename)
vocab_tags = load_vocab(config.tags_filename)
vocab_chars = load_vocab(config.chars_filename)
vocab_morphs = load_vocab(config.morphs_filename)  # morphs
vocab_syls = load_vocab(config.word_syl_filename)  # syllables
pos_tags = load_vocab(config.posTag_filename)      # POS tags
dic_words = load_vocab(config.word_dic_filename)   # dictionary

# get processing functions
processing_word = get_processing_word(vocab_words, dic_words, vocab_chars,
                                      vocab_morphs, vocab_syls, pos_tags,
                                      lowercase=True, chars=config.chars,
                                      morphs=config.morphs,
                                      posflag=config.posTag,
                                      pos_lm=config.posLM,
                                      dic_flag=config.dic_flag)
processing_tag = get_processing_word(vocab_tags, lowercase=False)
processing_pos = get_processing_word(pos_tags=pos_tags, posflag=True,
                                     lowercase=True, pos_lm=True)

# get pre-trained embeddings
embeddings = get_trimmed_glove_vectors(config.trimmed_filename)
dic_embeddings = get_exported_dic_vectors(config.exported_filename)
morph_embeddings = get_exported_morph_vectors(config.exported_mfilename)
from data_utils import get_trimmed_glove_vectors, load_vocab, \
    get_processing_word, CoNLLDataset
from model import NERModel
from config import Config

# create instance of config
config = Config()

# load vocabs
vocab_words = load_vocab(config.words_filename)
vocab_tags = load_vocab(config.tags_filename)
vocab_chars = load_vocab(config.chars_filename)

# get processing functions
processing_word = get_processing_word(vocab_words, vocab_chars,
                                      lowercase=True, chars=config.chars)
processing_tag = get_processing_word(vocab_tags, lowercase=False)

# get pre-trained embeddings
embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

# create datasets
dev = CoNLLDataset(config.dev_filename, processing_word,
                   processing_tag, config.max_iter)
test = CoNLLDataset(config.test_filename, processing_word,
                    processing_tag, config.max_iter)
train = CoNLLDataset(config.train_filename, processing_word,
                     processing_tag, config.max_iter)

# build model
def build_data(config):
    annotations = []
    meta_filename = 'sw%s%s-ms98-a-trans.text'  # % (file_id, speaker_id)
    for idx in os.listdir(config.wimp_corpus):
        idx_path = os.path.join(config.wimp_corpus, idx)
        if os.path.isfile(idx_path):
            continue
        for file_id in os.listdir(idx_path):
            folder = os.path.join(idx_path, file_id)
            if os.path.isfile(folder):
                continue
            wimp_trans_files = [
                os.path.join(folder, meta_filename % (file_id, 'A')),
                os.path.join(folder, meta_filename % (file_id, 'B'))
            ]
            swd_trans_files = [
                os.path.join(config.swd_transcripts, idx, file_id,
                             meta_filename % (file_id, 'A')),
                os.path.join(config.swd_transcripts, idx, file_id,
                             meta_filename % (file_id, 'B'))
            ]
            for i, wimp_trans_file in enumerate(wimp_trans_files):
                swd_trans_file = swd_trans_files[i]
                file_id, speaker = swd_trans_file.split("/")[-2:]
                speaker = speaker[6]
                with open(wimp_trans_file) as w_file_obj, \
                        open(swd_trans_file) as s_file_obj:
                    for line_num, (anns_, wrds_) in enumerate(
                            zip(w_file_obj, s_file_obj)):
                        sentence = []
                        anns = anns_.strip().split(' ')[3:]
                        wrds = wrds_.strip().split(' ')[3:]
                        assert len(anns) == len(wrds), \
                            "file mismatch, line %d : %s and %s" % (
                                line_num, swd_trans_file, wimp_trans_file)
                        for id_, wrd in enumerate(wrds):
                            wrd = clean_word(wrd)
                            if wrd != '':
                                sentence.append([(file_id, line_num, speaker),
                                                 wrd, float(anns[id_])])
                        if len(sentence) != 0:
                            annotations.append(sentence)

    random.shuffle(annotations)
    # 80% for training, 10% dev, 10% test (slice indices must be ints)
    n = len(annotations)
    d_train = annotations[:int(0.8 * n)]
    d_test = annotations[int(0.8 * n):int(0.9 * n)]
    d_dev = annotations[int(0.9 * n):]

    def prep_text_data(D, outfile):
        with open(outfile, 'w') as f:
            for sent in D:
                for _, word, label in sent:
                    f.write("%s %f\n" % (word, label))
                f.write("\n")

    prep_text_data(d_train, config.train_filename)
    prep_text_data(d_test, config.test_filename)
    prep_text_data(d_dev, config.dev_filename)

    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev = AnnotationDataset(config.dev_filename, processing_word)
    test = AnnotationDataset(config.test_filename, processing_word)
    train = AnnotationDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab; the vocabulary is built from training data only
    vocab_words, vocab_tags = get_vocabs([train])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    # Build and save char vocab
    train = AnnotationDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
import os

from data_utils import get_trimmed_glove_vectors, load_vocab, \
    get_processing_word, CoNLLDataset
from general_utils import get_logger
from model import NERModel
from config import config

# directory for training outputs
if not os.path.exists(config.output_path):
    os.makedirs(config.output_path)

# load vocabs
vocab_words = load_vocab(config.words_filename)
vocab_tags = load_vocab(config.tags_filename)

# get processing functions
processing_word = get_processing_word(vocab_words,
                                      lowercase=config.lowercase)
processing_tag = get_processing_word(vocab_tags, lowercase=False)

# get pre-trained embeddings
embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

# create datasets
dev = CoNLLDataset(config.dev_filename, processing_word,
                   processing_tag, config.max_iter)
test = CoNLLDataset(config.test_filename, processing_word,
                    processing_tag, config.max_iter)
train = CoNLLDataset(config.train_filename, processing_word,
                     processing_tag, config.max_iter)

# get logger
import numpy as np
import os
import tensorflow as tf
from config import Config
from sklearn.model_selection import train_test_split
from dataobject import CoNLLDataset
from data_utils import get_vocabs, UNK, NUM, \
    get_glove_vocab, write_vocab, load_vocab, get_char_vocab, \
    export_trimmed_glove_vectors, get_processing_word

# Create instance of config
config = Config()
processing_word = get_processing_word(lowercase=True)

# Generators
dev = CoNLLDataset(config.filename_dev, processing_word)
test = CoNLLDataset(config.filename_test, processing_word)
train = CoNLLDataset(config.filename_train, processing_word)

# Build Word and Tag vocab
vocab_words, vocab_tags = get_vocabs([train, dev, test])
vocab_glove = get_glove_vocab(config.filename_glove)

vocab = vocab_words & vocab_glove
vocab.add(config.UNK)
vocab.add(config.NUM)

# Save vocab
write_vocab(vocab, config.filename_words)
write_vocab(vocab_tags, config.filename_tags)
def build_data(config): """ Procedure to build data Args: config: defines attributes needed in the function Returns: creates vocab files from the datasets creates a npz embedding file from trimmed glove vectors """ processing_word = get_processing_word(lowercase=True) # Generators dev = CoNLLDataset(config.dev_filename, processing_word) test = CoNLLDataset(config.test_filename, processing_word) train = CoNLLDataset(config.train_filename, processing_word) # Build Word and Tag vocab vocab_words, vocab_tags, vocab_pos = get_vocabs([train, dev, test]) #pos adding----- vocab_glove = get_glove_vocab(config.glove_filename) vocab_dic = get_dic_vocab(config.dic_filename, 1) #add dic vector get vocab_syl = get_dic_vocab(config.syl_filename, 1) #add syl vector vocab_morph = get_morph_vocab(config.morph_vec_filename) #morph vector get vocab = vocab_words & vocab_glove vocab.add(UNK.decode('utf-8')) vocab.add(NUM.decode('utf-8')) word_dic = vocab_dic #add dic word_dic.add(UNK.decode('utf-8')) word_dic.add(NUM.decode('utf-8')) word_syl = vocab_syl #add syl word_syl.add(UNK.decode('utf-8')) word_syl.add(NUM.decode('utf-8')) word_morph = vocab_morph # add morph word_morph.add(UNK.decode('utf-8')) word_morph.add(NUM.decode('utf-8')) vocab_pos.add(UNK.decode('utf-8')) # Save vocab write_vocab(vocab, config.words_filename) write_vocab(vocab_tags, config.tags_filename) write_vocab(word_dic, config.word_dic_filename) #add dic write_vocab(word_syl, config.word_syl_filename) #add syl write_vocab(word_morph, config.morphs_filename) #add morph write_vocab(vocab_pos, config.posTag_filename) #add pos # Trim GloVe Vectors(pretrain vector) vocab = load_vocab(config.words_filename) export_trimmed_glove_vectors(vocab, config.glove_filename, config.trimmed_filename, config.dim) word_dic = load_vocab(config.word_dic_filename) #dic add export_dic_vectors(word_dic, config.dic_filename, config.exported_filename, config.dic_dim) word_syl = load_vocab(config.word_syl_filename) #syl add export_syl_vectors(word_syl, config.syl_filename, config.exported_sfilename, config.syl_dim) word_morph = load_vocab(config.morphs_filename) #morph add export_morph_vectors(word_morph, config.morph_vec_filename, config.exported_mfilename, config.dim_morph) vocab_pos = load_vocab(config.posTag_filename) #pos add export_pos_vectors(vocab_pos, config.pos_vec_filename, config.exported_pfilename, config.dim_pos) # Build and save char vocab, morph vocab train = CoNLLDataset(config.train_filename) vocab_chars = get_char_vocab(train) write_vocab(vocab_chars, config.chars_filename)