def main():
    """Smoke test: build a word vocab from the tokens of ./test_utils.py."""
    reader = TextReader("./test_utils.py")
    builder = VocabBuilder("w")
    for sent in reader:
        builder.feed_stream(sent.tokens)
    # finalize the vocab; the result is unused in this smoke test
    # (original bound it to an unused local and ended with a dead `pass`)
    builder.finish()
def aug_word2_vocab(self, stream, extra_stream, extra_embed_file: str):
    """Build a secondary "word2" vocab from *stream*; when an extra embedding
    file is given, also pull in words from *extra_stream* that the pretrained
    vectors cover, and store the filtered embedding matrix alongside."""
    zlog(
        f"Aug another word vocab from streams and extra_embed_file={extra_embed_file}"
    )
    builder = VocabBuilder("word2")
    for inst in stream:
        builder.feed_stream(inst.word_seq.vals)
    if not extra_embed_file:
        zwarn("WARNING: No pretrain file for aug node!!")
        vocab = builder.finish()  # no filtering!!
        embed = None
    else:
        # collect candidate words from the extra stream, keep only those hit
        # by the pretrained vectors and not already in the builder
        candidates = {w for inst in extra_stream for w in inst.word_seq.vals}
        vectors = WordVectors.load(extra_embed_file)
        for w in candidates:
            if vectors.has_key(w) and not builder.has_key_currently(w):
                builder.feed_one(w)
        vocab = builder.finish()  # no filtering!!
        embed = vocab.filter_embed(vectors, init_nohit=1.0, scale=1.0)
    self.put_voc("word2", vocab)
    self.put_emb("word2", embed)
def prepare_test(args, ConfType=None):
    """Prepare everything for testing: conf, model, vocab package and the
    batched test iterator. Returns (conf, model, vpack, test_iter)."""
    # configuration
    conf: OverallConf = init_everything(args, ConfType)
    dconf, mconf = conf.dconf, conf.mconf
    iconf = mconf.iconf
    # vocab package
    vpack = IEVocabPackage.build_by_reading(conf)
    # test-data reader
    test_streamer = get_data_reader(
        dconf.test, dconf.input_format, dconf.use_label0, dconf.noef_link0,
        dconf.aux_repr_test, max_evt_layers=dconf.max_evt_layers)
    # model, optionally loading saved weights
    model = build_model(conf.model_type, conf, vpack)
    if dconf.model_load_name != "":
        model.load(dconf.model_load_name)
    else:
        zwarn("No model to load, Debugging mode??")
    # =====
    # optionally augment the word vocab with extra pretrained embeddings
    pretrain_files = dconf.test_extra_pretrain_files
    if len(pretrain_files) > 0:
        # todo(note): aug-code mode is ignored for this project -> all empty codes
        pretrain_codes = [""] * len(pretrain_files)
        merged_vecs = WordVectors.load(pretrain_files[0], aug_code=pretrain_codes[0])
        merged_vecs.merge_others(
            [WordVectors.load(f, aug_code=c)
             for f, c in zip(pretrain_files[1:], pretrain_codes[1:])])
        # extra dictionary: only words of the test stream hit in the embeddings
        hit_vocab = VocabBuilder.build_from_stream(
            iter_hit_words(test_streamer, merged_vecs),
            sort_by_count=True, pre_list=(), post_list=())
        # hand the augmented vocab and embeddings to the model
        vpack.put_voc("word", model.aug_words_and_embs(hit_vocab, merged_vecs))
    # =====
    # use bert? todo(note): no pre-compute here in testing!
    if dconf.use_bert:
        bmodel = get_berter(dconf.bconf)
        test_streamer = BerterDataAuger(test_streamer, bmodel, "aux_repr")
    # No Cache!!
    test_inst_preparer = model.get_inst_preper(False)
    test_iter = batch_stream(
        index_stream(test_streamer, vpack, False, False, test_inst_preparer),
        iconf, False)
    return conf, model, vpack, test_iter
def prepare_test(args, ConfType=None):
    """Prepare everything for testing: conf, model, vocab package and the
    batched test iterator. Returns (conf, model, vpack, test_iter)."""
    # configuration
    conf = init_everything(args, ConfType)
    dconf, mconf = conf.dconf, conf.mconf
    # vocab package
    vpack = MLMVocabPackage.build_by_reading(dconf.dict_dir)
    # test-data reader with preprocessing (case/digit normalization)
    test_streamer = PreprocessStreamer(
        get_data_reader(dconf.test, dconf.input_format),
        lower_case=dconf.lower_case, norm_digit=dconf.norm_digit)
    # model, optionally loading saved weights
    model = build_model(conf, vpack)
    if dconf.model_load_name != "":
        model.load(dconf.model_load_name)
    else:
        zwarn("No model to load, Debugging mode??")
    # -----
    # optionally augment the word vocab with extra pretrained embeddings
    pretrain_files = dconf.vconf.test_extra_pretrain_files
    if len(pretrain_files) > 0:
        # fall back to empty aug-codes when none are configured
        pretrain_codes = dconf.vconf.test_extra_pretrain_codes or [""] * len(pretrain_files)
        merged_vecs = WordVectors.load(pretrain_files[0], aug_code=pretrain_codes[0])
        merged_vecs.merge_others(
            [WordVectors.load(f, aug_code=c)
             for f, c in zip(pretrain_files[1:], pretrain_codes[1:])])
        # extra dictionary: only words of the test stream hit in the embeddings
        hit_vocab = VocabBuilder.build_from_stream(
            iter_hit_words(test_streamer, merged_vecs),
            sort_by_count=True, pre_list=(), post_list=())
        # hand the augmented vocab and embeddings to the model
        vpack.put_voc("word", aug_words_and_embs(model, hit_vocab, merged_vecs))
    # =====
    # No Cache!!
    test_inst_preparer = model.get_inst_preper(False)
    test_iter = batch_stream(
        index_stream(test_streamer, vpack, False, False, test_inst_preparer,
                     dconf.backoff_pos_idx),
        mconf.test_batch_size, mconf, False)
    return conf, model, vpack, test_iter
def prepare_test(args, ConfType=None):
    """Prepare everything for testing: conf, model, vocab package and the
    batched test iterator. Returns (conf, model, vpack, test_iter)."""
    # configuration
    conf = init_everything(args, ConfType)
    dconf, pconf = conf.dconf, conf.pconf
    iconf = pconf.iconf
    # vocab package
    vpack = ParserVocabPackage.build_by_reading(dconf)
    # test-data reader
    test_streamer = get_data_reader(
        dconf.test, dconf.input_format, dconf.code_test, dconf.use_label0,
        dconf.aux_repr_test, dconf.aux_score_test)
    # model, optionally loading saved weights
    model = build_model(conf.partype, conf, vpack)
    if dconf.model_load_name != "":
        model.load(dconf.model_load_name)
    else:
        zwarn("No model to load, Debugging mode??")
    # =====
    # optionally augment the word vocab with extra pretrained embeddings
    pretrain_files = dconf.test_extra_pretrain_files
    if len(pretrain_files) > 0:
        # fall back to empty aug-codes when none are configured
        pretrain_codes = dconf.test_extra_pretrain_codes or [""] * len(pretrain_files)
        merged_vecs = WordVectors.load(pretrain_files[0], aug_code=pretrain_codes[0])
        merged_vecs.merge_others(
            [WordVectors.load(f, aug_code=c)
             for f, c in zip(pretrain_files[1:], pretrain_codes[1:])])
        # extra dictionary: only words of the test stream hit in the embeddings
        hit_vocab = VocabBuilder.build_from_stream(
            iter_hit_words(test_streamer, merged_vecs),
            sort_by_count=True, pre_list=(), post_list=())
        # hand the augmented vocab and embeddings to the model
        vpack.put_voc("word", model.aug_words_and_embs(hit_vocab, merged_vecs))
    # =====
    # No Cache!!
    test_inst_preparer = model.get_inst_preper(False)
    test_iter = batch_stream(
        index_stream(test_streamer, vpack, False, False, test_inst_preparer),
        iconf, False)
    return conf, model, vpack, test_iter
def build_from_stream(build_conf: MLMVocabPackageConf, stream, extra_stream):
    """Build all vocabs (word/char/pos/deplabel/ner) and optional word
    embeddings from data streams.

    Args:
        build_conf: options controlling UD2 backoffs/pre-values, thresholds
            and pretrained-embedding loading.
        stream: main instances; their counts feed the vocab builders.
        extra_stream: extra instances (usually dev&test); only used to collect
            words covered by the pretrained vectors, never to grow counts.
    Returns:
        A filled MLMVocabPackage.
    """
    zlog("Build vocabs from streams.")
    ret = MLMVocabPackage({}, {})
    # word builder, optionally with UD2-POS backoff symbols in the pre-list
    if build_conf.add_ud2_pos_backoffs:
        ud2_pos_pre_list = list(VocabBuilder.DEFAULT_PRE_LIST) + [
            UD2_POS_UNK_MAP[p] for p in UD2_POS_LIST
        ]
        word_builder = VocabBuilder("word", pre_list=ud2_pos_pre_list)
    else:
        word_builder = VocabBuilder("word")
    char_builder = VocabBuilder("char")
    pos_builder = VocabBuilder("pos")
    deplabel_builder = VocabBuilder("deplabel")
    ner_builder = VocabBuilder("ner")
    if build_conf.add_ud2_prevalues:
        zlog(
            f"Add pre-defined UD2 values for upos({len(UD2_POS_LIST)}) and ulabel({len(UD2_LABEL_LIST)})."
        )
        pos_builder.feed_stream(UD2_POS_LIST)
        deplabel_builder.feed_stream(UD2_LABEL_LIST)
    # count words/chars/ner from the main stream
    # todo(+N): currently we are assuming that we are using UD pos/deps, and
    #  directly go with the default ones; pos/deplabel are not counted here
    for inst in stream:
        word_builder.feed_stream(inst.word_seq.vals)
        for w in inst.word_seq.vals:
            char_builder.feed_stream(w)
        if hasattr(inst, "ner_seq") and inst.ner_seq.has_vals():
            ner_builder.feed_stream(inst.ner_seq.vals)
    # ===== embeddings
    w2vec = None
    if build_conf.read_from_pretrain:
        # todo(warn): for convenience, extra vocab (usually dev&test) is only
        #  used for collecting pre-train vecs
        extra_word_set = set(w for inst in extra_stream for w in inst.word_seq.vals)
        # ----- load (possibly multiple) pretrain embeddings
        # build_conf.pretrain_file must be provided (can be multiple files!)
        list_pretrain_file = build_conf.pretrain_file
        # pad default codes on a COPY: the original `extend` mutated the shared
        # build_conf.pretrain_codes list in place on every call; behavior is
        # identical since only indices < len(list_pretrain_file) are ever read
        list_code_pretrain = list(build_conf.pretrain_codes) + [""] * len(list_pretrain_file)
        w2vec = WordVectors.load(list_pretrain_file[0], aug_code=list_code_pretrain[0])
        if len(list_pretrain_file) > 1:
            w2vec.merge_others([
                WordVectors.load(list_pretrain_file[i], aug_code=list_code_pretrain[i])
                for i in range(1, len(list_pretrain_file))
            ])
        # ----- first filter by thresholds (optionally keeping anything the
        # pretrained vectors cover)
        word_builder.filter(
            lambda ww, rank, val: (val >= build_conf.word_fthres and rank <= build_conf.word_rthres)
            or (build_conf.ignore_thresh_with_pretrain and w2vec.has_key(ww)))
        # then add extra (dev/test) words that the pretrained vectors cover
        if build_conf.ignore_thresh_with_pretrain:
            for w in extra_word_set:
                if w2vec.has_key(w) and (not word_builder.has_key_currently(w)):
                    word_builder.feed_one(w)
        word_vocab = word_builder.finish()
        word_embed1 = word_vocab.filter_embed(
            w2vec, init_nohit=build_conf.pretrain_init_nohit,
            scale=build_conf.pretrain_scale)
    else:
        word_vocab = word_builder.finish_thresh(
            rthres=build_conf.word_rthres, fthres=build_conf.word_fthres)
        word_embed1 = None
    #
    char_vocab = char_builder.finish()
    pos_vocab = pos_builder.finish(sort_by_count=False)
    deplabel_vocab = deplabel_builder.finish(sort_by_count=False)
    ner_vocab = ner_builder.finish()
    # assign
    ret.put_voc("word", word_vocab)
    ret.put_voc("char", char_vocab)
    ret.put_voc("pos", pos_vocab)
    ret.put_voc("deplabel", deplabel_vocab)
    ret.put_voc("ner", ner_vocab)
    ret.put_emb("word", word_embed1)
    #
    return ret
def to_zvoc(self) -> Vocab:
    """Convert this word-count table (self.w2c) into a count-sorted Vocab."""
    vb = VocabBuilder("word", default_val=0)
    for word, count in self.w2c.items():
        vb.feed_one(word, count)
    return vb.finish(sort_by_count=True)