Example #1
def main():
    s = TextReader("./test_utils.py")  # stream over the tokenized lines of a text file
    vb = VocabBuilder("w")
    for one in s:
        vb.feed_stream(one.tokens)  # count every token of the line
    v = vb.finish()  # freeze the counts into a Vocab
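All the snippets here follow the same two-phase pattern: feed token sequences into a VocabBuilder, then call finish() to freeze the counts into a Vocab. As a rough mental model only, a hypothetical stand-in (ToyVocabBuilder is illustrative, not the real API behind these examples) could look like:

from collections import Counter

class ToyVocabBuilder:
    # Hypothetical stand-in for the VocabBuilder used in these examples (not the real API).
    def __init__(self, name: str):
        self.name = name
        self.counts = Counter()

    def feed_stream(self, tokens):
        self.counts.update(tokens)  # count every token in the sequence

    def feed_one(self, w, c=1):
        self.counts[w] += c  # count a single word, optionally with a given count

    def finish(self, sort_by_count=True):
        # freeze counts into a word -> integer-id mapping, frequent words first
        items = self.counts.most_common() if sort_by_count else sorted(self.counts.items())
        return {w: i for i, (w, _) in enumerate(items)}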
Example #2
def aug_word2_vocab(self, stream, extra_stream, extra_embed_file: str):
    zlog(f"Aug another word vocab from streams and extra_embed_file={extra_embed_file}")
    word_builder = VocabBuilder("word2")
    for inst in stream:
        word_builder.feed_stream(inst.word_seq.vals)
    # embeddings
    if len(extra_embed_file) > 0:
        extra_word_set = set(w for inst in extra_stream for w in inst.word_seq.vals)
        w2vec = WordVectors.load(extra_embed_file)
        for w in extra_word_set:
            if w2vec.has_key(w) and (not word_builder.has_key_currently(w)):
                word_builder.feed_one(w)
        word_vocab = word_builder.finish()  # no filtering!!
        word_embed1 = word_vocab.filter_embed(w2vec, init_nohit=1.0, scale=1.0)
    else:
        zwarn("WARNING: No pretrain file for aug node!!")
        word_vocab = word_builder.finish()  # no filtering!!
        word_embed1 = None
    self.put_voc("word2", word_vocab)
    self.put_emb("word2", word_embed1)
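The inner loop above is the key step of the augmentation: an extra word is admitted only if it has a pretrained vector (w2vec.has_key) and is not already in the builder, so no vector-less rows are added. A toy version of that check with plain dicts (hypothetical names, not the real API):

def aug_with_hits(vocab, extra_words, embeddings):
    # admit only words that have a pretrained vector and are not already known
    for w in extra_words:
        if w in embeddings and w not in vocab:
            vocab[w] = len(vocab)  # append at the next free id
    return vocab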
Example #3
def prepare_test(args, ConfType=None):
    # conf
    conf: OverallConf = init_everything(args, ConfType)
    dconf, mconf = conf.dconf, conf.mconf
    iconf = mconf.iconf
    # vocab
    vpack = IEVocabPackage.build_by_reading(conf)
    # prepare data
    test_streamer = get_data_reader(dconf.test, dconf.input_format, dconf.use_label0, dconf.noef_link0,
                                    dconf.aux_repr_test, max_evt_layers=dconf.max_evt_layers)
    # model
    model = build_model(conf.model_type, conf, vpack)
    if dconf.model_load_name != "":
        model.load(dconf.model_load_name)
    else:
        zwarn("No model to load, Debugging mode??")
    # =====
    # augment with extra embeddings
    extra_embed_files = dconf.test_extra_pretrain_files
    if len(extra_embed_files) > 0:
        # get embeddings
        extra_codes = []  # todo(note): ignore this mode for this project
        if len(extra_codes) == 0:
            extra_codes = [""] * len(extra_embed_files)
        extra_embedding = WordVectors.load(extra_embed_files[0], aug_code=extra_codes[0])
        extra_embedding.merge_others([WordVectors.load(one_file, aug_code=one_code) for one_file, one_code in
                                      zip(extra_embed_files[1:], extra_codes[1:])])
        # get extra dictionary (only those words hit in extra-embed)
        extra_vocab = VocabBuilder.build_from_stream(iter_hit_words(test_streamer, extra_embedding),
                                                     sort_by_count=True, pre_list=(), post_list=())
        # give them to the model
        new_vocab = model.aug_words_and_embs(extra_vocab, extra_embedding)
        vpack.put_voc("word", new_vocab)
    # =====
    # use bert? todo(note): no pre-compute here in testing!
    if dconf.use_bert:
        bmodel = get_berter(dconf.bconf)
        test_streamer = BerterDataAuger(test_streamer, bmodel, "aux_repr")
    #
    # No Cache!!
    test_inst_preparer = model.get_inst_preper(False)
    test_iter = batch_stream(index_stream(test_streamer, vpack, False, False, test_inst_preparer), iconf, False)
    return conf, model, vpack, test_iter
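Examples #4 and #5 below repeat the same prepare_test skeleton for other tasks. The steps are identical: read the config, load the vocab package, build the test streamer, load the model (or warn in debugging mode), optionally augment the word vocab with test-time extra embeddings, then wrap everything into an indexed, batched stream; only the VocabPackage class, the reader arguments, and the batching parameters differ.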
Example #4
def prepare_test(args, ConfType=None):
    # conf
    conf = init_everything(args, ConfType)
    dconf, mconf = conf.dconf, conf.mconf
    # vocab
    vpack = MLMVocabPackage.build_by_reading(dconf.dict_dir)
    # prepare data
    test_streamer = PreprocessStreamer(get_data_reader(dconf.test, dconf.input_format),
                                       lower_case=dconf.lower_case, norm_digit=dconf.norm_digit)
    # model
    model = build_model(conf, vpack)
    if dconf.model_load_name != "":
        model.load(dconf.model_load_name)
    else:
        zwarn("No model to load, Debugging mode??")
    # -----
    # augment with extra embeddings for test stream?
    extra_embed_files = dconf.vconf.test_extra_pretrain_files
    if len(extra_embed_files) > 0:
        # get embeddings
        extra_codes = dconf.vconf.test_extra_pretrain_codes
        if len(extra_codes) == 0:
            extra_codes = [""] * len(extra_embed_files)
        extra_embedding = WordVectors.load(extra_embed_files[0], aug_code=extra_codes[0])
        extra_embedding.merge_others([WordVectors.load(one_file, aug_code=one_code) for one_file, one_code in
                                      zip(extra_embed_files[1:], extra_codes[1:])])
        # get extra dictionary (only those words hit in extra-embed)
        extra_vocab = VocabBuilder.build_from_stream(iter_hit_words(test_streamer, extra_embedding),
                                                     sort_by_count=True, pre_list=(), post_list=())
        # give them to the model
        new_vocab = aug_words_and_embs(model, extra_vocab, extra_embedding)
        vpack.put_voc("word", new_vocab)
    # =====
    # No Cache!!
    test_inst_preparer = model.get_inst_preper(False)
    backoff_pos_idx = dconf.backoff_pos_idx
    test_iter = batch_stream(index_stream(test_streamer, vpack, False, False, test_inst_preparer, backoff_pos_idx),
                             mconf.test_batch_size, mconf, False)
    return conf, model, vpack, test_iter
Example #5
def prepare_test(args, ConfType=None):
    # conf
    conf = init_everything(args, ConfType)
    dconf, pconf = conf.dconf, conf.pconf
    iconf = pconf.iconf
    # vocab
    vpack = ParserVocabPackage.build_by_reading(dconf)
    # prepare data
    test_streamer = get_data_reader(dconf.test, dconf.input_format, dconf.code_test, dconf.use_label0,
                                    dconf.aux_repr_test, dconf.aux_score_test)
    # model
    model = build_model(conf.partype, conf, vpack)
    if dconf.model_load_name != "":
        model.load(dconf.model_load_name)
    else:
        zwarn("No model to load, Debugging mode??")
    # =====
    # augment with extra embeddings
    extra_embed_files = dconf.test_extra_pretrain_files
    if len(extra_embed_files) > 0:
        # get embeddings
        extra_codes = dconf.test_extra_pretrain_codes
        if len(extra_codes) == 0:
            extra_codes = [""] * len(extra_embed_files)
        extra_embedding = WordVectors.load(extra_embed_files[0], aug_code=extra_codes[0])
        extra_embedding.merge_others([WordVectors.load(one_file, aug_code=one_code) for one_file, one_code in
                                      zip(extra_embed_files[1:], extra_codes[1:])])
        # get extra dictionary (only those words hit in extra-embed)
        extra_vocab = VocabBuilder.build_from_stream(iter_hit_words(test_streamer, extra_embedding),
                                                     sort_by_count=True, pre_list=(), post_list=())
        # give them to the model
        new_vocab = model.aug_words_and_embs(extra_vocab, extra_embedding)
        vpack.put_voc("word", new_vocab)
    # =====
    # No Cache!!
    test_inst_preparer = model.get_inst_preper(False)
    test_iter = batch_stream(index_stream(test_streamer, vpack, False, False, test_inst_preparer), iconf, False)
    return conf, model, vpack, test_iter
Example #6
def build_from_stream(build_conf: MLMVocabPackageConf, stream, extra_stream):
    zlog("Build vocabs from streams.")
    ret = MLMVocabPackage({}, {})
    # -----
    if build_conf.add_ud2_pos_backoffs:
        ud2_pos_pre_list = list(VocabBuilder.DEFAULT_PRE_LIST) + [UD2_POS_UNK_MAP[p] for p in UD2_POS_LIST]
        word_builder = VocabBuilder("word", pre_list=ud2_pos_pre_list)
    else:
        word_builder = VocabBuilder("word")
    char_builder = VocabBuilder("char")
    pos_builder = VocabBuilder("pos")
    deplabel_builder = VocabBuilder("deplabel")
    ner_builder = VocabBuilder("ner")
    if build_conf.add_ud2_prevalues:
        zlog(f"Add pre-defined UD2 values for upos({len(UD2_POS_LIST)}) and ulabel({len(UD2_LABEL_LIST)}).")
        pos_builder.feed_stream(UD2_POS_LIST)
        deplabel_builder.feed_stream(UD2_LABEL_LIST)
    for inst in stream:
        word_builder.feed_stream(inst.word_seq.vals)
        for w in inst.word_seq.vals:
            char_builder.feed_stream(w)
        # todo(+N): currently we are assuming that we are using UD pos/deps, and directly go with the default ones
        # pos and label can be optional??
        # if inst.poses.has_vals():
        #     pos_builder.feed_stream(inst.poses.vals)
        # if inst.deplabels.has_vals():
        #     deplabel_builder.feed_stream(inst.deplabels.vals)
        if hasattr(inst, "ner_seq") and inst.ner_seq.has_vals():
            ner_builder.feed_stream(inst.ner_seq.vals)
    # ===== embeddings
    w2vec = None
    if build_conf.read_from_pretrain:
        # todo(warn): for convenience, extra vocab (usually dev&test) is only used for collecting pre-train vecs
        extra_word_set = set(w for inst in extra_stream for w in inst.word_seq.vals)
        # ----- load (possibly multiple) pretrain embeddings
        # must provide build_conf.pretrain_file (there can be multiple pretrain files!)
        list_pretrain_file, list_code_pretrain = build_conf.pretrain_file, build_conf.pretrain_codes
        list_code_pretrain.extend([""] * len(list_pretrain_file))  # pad default ones
        w2vec = WordVectors.load(list_pretrain_file[0], aug_code=list_code_pretrain[0])
        if len(list_pretrain_file) > 1:
            w2vec.merge_others([WordVectors.load(list_pretrain_file[i], aug_code=list_code_pretrain[i])
                                for i in range(1, len(list_pretrain_file))])
        # -----
        # first filter according to thresholds
        word_builder.filter(lambda ww, rank, val: (val >= build_conf.word_fthres and rank <= build_conf.word_rthres)
                            or (build_conf.ignore_thresh_with_pretrain and w2vec.has_key(ww)))
        # then add extra ones
        if build_conf.ignore_thresh_with_pretrain:
            for w in extra_word_set:
                if w2vec.has_key(w) and (not word_builder.has_key_currently(w)):
                    word_builder.feed_one(w)
        word_vocab = word_builder.finish()
        word_embed1 = word_vocab.filter_embed(w2vec, init_nohit=build_conf.pretrain_init_nohit,
                                              scale=build_conf.pretrain_scale)
    else:
        word_vocab = word_builder.finish_thresh(rthres=build_conf.word_rthres, fthres=build_conf.word_fthres)
        word_embed1 = None
    #
    char_vocab = char_builder.finish()
    pos_vocab = pos_builder.finish(sort_by_count=False)
    deplabel_vocab = deplabel_builder.finish(sort_by_count=False)
    ner_vocab = ner_builder.finish()
    # assign
    ret.put_voc("word", word_vocab)
    ret.put_voc("char", char_vocab)
    ret.put_voc("pos", pos_vocab)
    ret.put_voc("deplabel", deplabel_vocab)
    ret.put_voc("ner", ner_vocab)
    ret.put_emb("word", word_embed1)
    #
    return ret
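The filter lambda above keeps a word if it clears both the frequency threshold (val >= word_fthres) and the rank threshold (rank <= word_rthres), or, when ignore_thresh_with_pretrain is set, if the pretrained vectors cover it. A toy re-implementation over a Counter (a sketch under assumed semantics, not the real filter):

from collections import Counter

def filter_by_thresh(counts: Counter, fthres: int, rthres: int, pretrained=frozenset()):
    kept = []
    for rank, (w, val) in enumerate(counts.most_common(), start=1):
        # keep frequent-and-highly-ranked words, plus anything covered by pretrained vectors
        if (val >= fthres and rank <= rthres) or (w in pretrained):
            kept.append(w)
    return kept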
Example #7
def to_zvoc(self) -> Vocab:
    builder = VocabBuilder("word", default_val=0)
    for w, c in self.w2c.items():
        builder.feed_one(w, c)
    voc: Vocab = builder.finish(sort_by_count=True)
    return voc
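With the ToyVocabBuilder sketch from Example #1, the same dict-to-vocab conversion reads (toy code, not the real Vocab class):

w2c = {"the": 100, "cat": 7, "sat": 3}
builder = ToyVocabBuilder("word")
for w, c in w2c.items():
    builder.feed_one(w, c)
vocab = builder.finish(sort_by_count=True)  # {'the': 0, 'cat': 1, 'sat': 2}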