Python VocabBuilder.filter примеры использования

Язык программирования: Python

Пространство имен/Пакет: msp.data

Класс/Тип: VocabBuilder

Метод/Функция: filter

Примеров на hotexamples.com: 3

Python VocabBuilder.filter - 3 примера найдено. Это лучшие примеры Python кода для msp.data.VocabBuilder.filter, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

VocabBuilder(7)

finish(7)

feed_stream(6)

feed_one(5)

build_from_stream(4)

has_key_currently(4)

filter(3)

finish_thresh(3)

Пример #1

Показать файл

Файл: vocab.py Проект: ValentinaPy/zmsp

 def build_from_stream(dconf: DConf, stream, extra_stream):
     zlog("Build vocabs from streams.")
     ret = ParserVocabPackage({}, {}, dconf)
     #
     word_builder = VocabBuilder("word")
     char_builder = VocabBuilder("char")
     pos_builder = VocabBuilder("pos")
     label_builder = VocabBuilder("label")
     word_normer = ret.word_normer
     if dconf.vocab_add_prevalues:
         zlog(
             f"Add pre-defined values for upos({len(ParserVocabPackage.PRE_VALUES_UPOS)}) and "
             f"ulabel({len(ParserVocabPackage.PRE_VALUES_ULAB)}).")
         pos_builder.feed_stream(ParserVocabPackage.PRE_VALUES_UPOS)
         label_builder.feed_stream(ParserVocabPackage.PRE_VALUES_ULAB)
     for inst in stream:
         # todo(warn): only do special handling for words
         # there must be words
         word_builder.feed_stream(word_normer.norm_stream(inst.words.vals))
         for w in inst.words.vals:
             char_builder.feed_stream(w)
         # pos and label can be optional
         if inst.poses.has_vals():
             pos_builder.feed_stream(inst.poses.vals)
         if inst.labels.has_vals():
             label_builder.feed_stream(inst.labels.vals)
     #
     w2vec = None
     if dconf.init_from_pretrain:
         # todo(warn): for convenience, extra vocab (usually dev&test) is only used for collecting pre-train vecs
         extra_word_set = set()
         for inst in extra_stream:
             for w in word_normer.norm_stream(inst.words.vals):
                 extra_word_set.add(w)
         # ----- load (possibly multiple) pretrain embeddings
         # must provide dconf.pretrain_file (there can be multiple pretrain files!)
         list_pretrain_file, list_code_pretrain = dconf.pretrain_file, dconf.code_pretrain
         list_code_pretrain.extend(
             [""] * len(list_pretrain_file))  # pad default ones
         w2vec = WordVectors.load(list_pretrain_file[0],
                                  aug_code=list_code_pretrain[0])
         if len(list_pretrain_file) > 1:
             w2vec.merge_others([
                 WordVectors.load(list_pretrain_file[i],
                                  aug_code=list_code_pretrain[i])
                 for i in range(1, len(list_pretrain_file))
             ])
         # -----
         # first filter according to thresholds
         word_builder.filter(lambda ww, rank, val:
                             (val >= dconf.word_fthres and rank <= dconf.
                              word_rthres) or w2vec.has_key(ww))
         # then add extra ones
         for w in extra_word_set:
             if w2vec.has_key(w) and (
                     not word_builder.has_key_currently(w)):
                 word_builder.feed_one(w)
         word_vocab = word_builder.finish()
         word_embed1 = word_vocab.filter_embed(
             w2vec,
             init_nohit=dconf.pretrain_init_nohit,
             scale=dconf.pretrain_scale)
     else:
         word_vocab = word_builder.finish_thresh(rthres=dconf.word_rthres,
                                                 fthres=dconf.word_fthres)
         word_embed1 = None
     #
     char_vocab = char_builder.finish()
     # todo(+1): extra pos/label symbols?
     TARGET_END = VocabHelper.convert_special_pattern("unk")
     pos_vocab = pos_builder.finish(
         target_range=(1, TARGET_END))  # only real tags
     label_vocab = label_builder.finish(target_range=(1, TARGET_END))
     # assign
     ret.put_voc("word", word_vocab)
     ret.put_voc("char", char_vocab)
     ret.put_voc("pos", pos_vocab)
     ret.put_voc("label", label_vocab)
     ret.put_emb("word", word_embed1)
     #
     return ret

Пример #2

Показать файл

Файл: vocab.py Проект: ValentinaPy/zmsp

 def build_from_stream(conf: OverallConf, stream, extra_stream):
     dconf = conf.dconf
     zlog("Build vocabs from streams.")
     ret = IEVocabPackage({}, {}, dconf)
     # here, collect them all
     # -- basic inputs
     word_builder = VocabBuilder("word")
     char_builder = VocabBuilder("char")
     lemma_builder = VocabBuilder("lemma")
     upos_builder = VocabBuilder("upos")
     ulabel_builder = VocabBuilder("ulabel")
     # -- outputs (event type, entity/filler type, arg role type) (type -> count)
     evt_type_builder = defaultdict(int)
     ef_type_builder = defaultdict(int)
     arg_role_builder = defaultdict(int)
     for inst in stream:
         # -- basic inputs
         for sent in inst.sents:
             word_builder.feed_stream(sent.words.vals)
             for w in sent.words.vals:
                 char_builder.feed_stream(w)
             lemma_builder.feed_stream(sent.lemmas.vals)
             upos_builder.feed_stream(sent.uposes.vals)
             ulabel_builder.feed_stream(sent.ud_labels.vals)
         # -- outputs
         # assert inst.entity_fillers is not None, "For building vocabs, need to provide training instances!"
         assert inst.events is not None, "For building vocabs, need to provide training instances!"
         if inst.entity_fillers is not None:
             for one_ef in inst.entity_fillers:
                 ef_type_builder[one_ef.type] += 1
         for one_evt in inst.events:
             evt_type_builder[one_evt.type] += 1
             if one_evt.links is not None:
                 for one_arg in one_evt.links:
                     arg_role_builder[one_arg.role] += 1
     # build real hlabel-types
     hl_evt = HLabelVocab("event", conf.mconf.hl_evt, evt_type_builder)
     hl_ef = HLabelVocab("entity_filler", conf.mconf.hl_ef, ef_type_builder)
     hl_arg = HLabelVocab("arg", conf.mconf.hl_arg, arg_role_builder)
     # deal with pre-trained word embeddings
     w2vec = None
     if dconf.init_from_pretrain:
         # todo(warn): for convenience, extra vocab (usually dev&test) is only used for collecting pre-train vecs
         # collect extra words and lemmas
         extra_word_set = set()
         extra_lemma_set = set()
         for inst in extra_stream:
             for sent in inst.sents:
                 for w in sent.words.vals:
                     extra_word_set.add(w)
                 for w in sent.lemmas.vals:
                     extra_lemma_set.add(w)
         # must provide dconf.pretrain_file
         w2vec = WordVectors.load(dconf.pretrain_file)
         # first filter according to thresholds
         word_builder.filter(lambda ww, rank, val:
                             (val >= dconf.word_fthres and rank <= dconf.
                              word_rthres) or w2vec.has_key(ww))
         lemma_builder.filter(lambda ww, rank, val:
                              (val >= dconf.word_fthres and rank <= dconf.
                               word_rthres) or w2vec.has_key(ww))
         # then add extra ones
         for w in extra_word_set:
             if w2vec.has_key(w) and (
                     not word_builder.has_key_currently(w)):
                 word_builder.feed_one(w)
         for w in extra_lemma_set:
             if w2vec.has_key(w) and (
                     not lemma_builder.has_key_currently(w)):
                 lemma_builder.feed_one(w)
         # finially build the vocab and embeds
         word_vocab = word_builder.finish()
         word_embed1 = word_vocab.filter_embed(
             w2vec,
             init_nohit=dconf.pretrain_init_nohit,
             scale=dconf.pretrain_scale)
         lemma_vocab = lemma_builder.finish()
         lemma_embed1 = lemma_vocab.filter_embed(
             w2vec,
             init_nohit=dconf.pretrain_init_nohit,
             scale=dconf.pretrain_scale)
         # first build pool-embeds, the final decision will depend on each of the flags
         # todo(WARN): assert all hit?
         hl_evt_pembed = hl_evt.filter_pembed(
             w2vec,
             init_nohit=dconf.pretrain_init_nohit,
             assert_all_hit=False)
         hl_ef_pembed = hl_ef.filter_pembed(
             w2vec,
             init_nohit=dconf.pretrain_init_nohit,
             assert_all_hit=False)
         hl_arg_pembed = hl_arg.filter_pembed(
             w2vec,
             init_nohit=dconf.pretrain_init_nohit,
             assert_all_hit=False)
         # by-product of filtered output pre-trained embeddings for later faster processing
         if dconf.output_pretrain_filter:
             w2vec.save_hits(dconf.output_pretrain_filter)
     else:
         word_vocab = word_builder.finish_thresh(rthres=dconf.word_rthres,
                                                 fthres=dconf.word_fthres)
         lemma_vocab = lemma_builder.finish_thresh(rthres=dconf.word_rthres,
                                                   fthres=dconf.word_fthres)
         word_embed1 = lemma_embed1 = None
         #
         for one_cc in [
                 conf.mconf.hl_evt, conf.mconf.hl_ef, conf.mconf.hl_arg
         ]:
             if hasattr(one_cc, "pool_init_hint"):
                 assert not one_cc.pool_init_hint, "cannot init pool because the overall pre-train-init flag is not set"
         hl_evt_pembed = hl_ef_pembed = hl_arg_pembed = None
     char_vocab = char_builder.finish()
     upos_vocab = upos_builder.finish()
     ulabel_vocab = ulabel_builder.finish()
     # =====
     # finally assign things
     ret.put_voc("word", word_vocab)
     ret.put_voc("lemma", lemma_vocab)
     ret.put_voc("char", char_vocab)
     ret.put_voc("upos", upos_vocab)
     ret.put_voc("ulabel", ulabel_vocab)
     ret.put_emb("word", word_embed1)
     ret.put_emb("lemma", lemma_embed1)
     # don't need to be jsonable since we are using pickle all at once
     # todo(WARN): the conf in vocab is also stored!!
     ret.put_voc("hl_evt", hl_evt)
     ret.put_voc("hl_ef", hl_ef)
     ret.put_voc("hl_arg", hl_arg)
     ret.put_emb("hl_evt", hl_evt_pembed)
     ret.put_emb("hl_ef", hl_ef_pembed)
     ret.put_emb("hl_arg", hl_arg_pembed)
     return ret

Пример #3

Показать файл

Файл: vocab.py Проект: ValentinaPy/zmsp

 def build_from_stream(build_conf: MLMVocabPackageConf, stream,
                       extra_stream):
     zlog("Build vocabs from streams.")
     ret = MLMVocabPackage({}, {})
     # -----
     if build_conf.add_ud2_pos_backoffs:
         ud2_pos_pre_list = list(VocabBuilder.DEFAULT_PRE_LIST) + [
             UD2_POS_UNK_MAP[p] for p in UD2_POS_LIST
         ]
         word_builder = VocabBuilder("word", pre_list=ud2_pos_pre_list)
     else:
         word_builder = VocabBuilder("word")
     char_builder = VocabBuilder("char")
     pos_builder = VocabBuilder("pos")
     deplabel_builder = VocabBuilder("deplabel")
     ner_builder = VocabBuilder("ner")
     if build_conf.add_ud2_prevalues:
         zlog(
             f"Add pre-defined UD2 values for upos({len(UD2_POS_LIST)}) and ulabel({len(UD2_LABEL_LIST)})."
         )
         pos_builder.feed_stream(UD2_POS_LIST)
         deplabel_builder.feed_stream(UD2_LABEL_LIST)
     for inst in stream:
         word_builder.feed_stream(inst.word_seq.vals)
         for w in inst.word_seq.vals:
             char_builder.feed_stream(w)
         # todo(+N): currently we are assuming that we are using UD pos/deps, and directly go with the default ones
         # pos and label can be optional??
         # if inst.poses.has_vals():
         #     pos_builder.feed_stream(inst.poses.vals)
         # if inst.deplabels.has_vals():
         #     deplabel_builder.feed_stream(inst.deplabels.vals)
         if hasattr(inst, "ner_seq") and inst.ner_seq.has_vals():
             ner_builder.feed_stream(inst.ner_seq.vals)
     # ===== embeddings
     w2vec = None
     if build_conf.read_from_pretrain:
         # todo(warn): for convenience, extra vocab (usually dev&test) is only used for collecting pre-train vecs
         extra_word_set = set(w for inst in extra_stream
                              for w in inst.word_seq.vals)
         # ----- load (possibly multiple) pretrain embeddings
         # must provide build_conf.pretrain_file (there can be multiple pretrain files!)
         list_pretrain_file, list_code_pretrain = build_conf.pretrain_file, build_conf.pretrain_codes
         list_code_pretrain.extend(
             [""] * len(list_pretrain_file))  # pad default ones
         w2vec = WordVectors.load(list_pretrain_file[0],
                                  aug_code=list_code_pretrain[0])
         if len(list_pretrain_file) > 1:
             w2vec.merge_others([
                 WordVectors.load(list_pretrain_file[i],
                                  aug_code=list_code_pretrain[i])
                 for i in range(1, len(list_pretrain_file))
             ])
         # -----
         # first filter according to thresholds
         word_builder.filter(
             lambda ww, rank, val: (val >= build_conf.word_fthres and rank
                                    <= build_conf.word_rthres) or
             (build_conf.ignore_thresh_with_pretrain and w2vec.has_key(ww)))
         # then add extra ones
         if build_conf.ignore_thresh_with_pretrain:
             for w in extra_word_set:
                 if w2vec.has_key(w) and (
                         not word_builder.has_key_currently(w)):
                     word_builder.feed_one(w)
         word_vocab = word_builder.finish()
         word_embed1 = word_vocab.filter_embed(
             w2vec,
             init_nohit=build_conf.pretrain_init_nohit,
             scale=build_conf.pretrain_scale)
     else:
         word_vocab = word_builder.finish_thresh(
             rthres=build_conf.word_rthres, fthres=build_conf.word_fthres)
         word_embed1 = None
     #
     char_vocab = char_builder.finish()
     pos_vocab = pos_builder.finish(sort_by_count=False)
     deplabel_vocab = deplabel_builder.finish(sort_by_count=False)
     ner_vocab = ner_builder.finish()
     # assign
     ret.put_voc("word", word_vocab)
     ret.put_voc("char", char_vocab)
     ret.put_voc("pos", pos_vocab)
     ret.put_voc("deplabel", deplabel_vocab)
     ret.put_voc("ner", ner_vocab)
     ret.put_emb("word", word_embed1)
     #
     return ret