Пример #1
0
 def build_from_stream(dconf: DConf, stream, extra_stream):
     zlog("Build vocabs from streams.")
     ret = ParserVocabPackage({}, {}, dconf)
     #
     word_builder = VocabBuilder("word")
     char_builder = VocabBuilder("char")
     pos_builder = VocabBuilder("pos")
     label_builder = VocabBuilder("label")
     word_normer = ret.word_normer
     if dconf.vocab_add_prevalues:
         zlog(
             f"Add pre-defined values for upos({len(ParserVocabPackage.PRE_VALUES_UPOS)}) and "
             f"ulabel({len(ParserVocabPackage.PRE_VALUES_ULAB)}).")
         pos_builder.feed_stream(ParserVocabPackage.PRE_VALUES_UPOS)
         label_builder.feed_stream(ParserVocabPackage.PRE_VALUES_ULAB)
     for inst in stream:
         # todo(warn): only do special handling for words
         # there must be words
         word_builder.feed_stream(word_normer.norm_stream(inst.words.vals))
         for w in inst.words.vals:
             char_builder.feed_stream(w)
         # pos and label can be optional
         if inst.poses.has_vals():
             pos_builder.feed_stream(inst.poses.vals)
         if inst.labels.has_vals():
             label_builder.feed_stream(inst.labels.vals)
     #
     w2vec = None
     if dconf.init_from_pretrain:
         # todo(warn): for convenience, extra vocab (usually dev&test) is only used for collecting pre-train vecs
         extra_word_set = set()
         for inst in extra_stream:
             for w in word_normer.norm_stream(inst.words.vals):
                 extra_word_set.add(w)
         # ----- load (possibly multiple) pretrain embeddings
         # must provide dconf.pretrain_file (there can be multiple pretrain files!)
         list_pretrain_file, list_code_pretrain = dconf.pretrain_file, dconf.code_pretrain
         list_code_pretrain.extend(
             [""] * len(list_pretrain_file))  # pad default ones
         w2vec = WordVectors.load(list_pretrain_file[0],
                                  aug_code=list_code_pretrain[0])
         if len(list_pretrain_file) > 1:
             w2vec.merge_others([
                 WordVectors.load(list_pretrain_file[i],
                                  aug_code=list_code_pretrain[i])
                 for i in range(1, len(list_pretrain_file))
             ])
         # -----
         # first filter according to thresholds
         word_builder.filter(lambda ww, rank, val:
                             (val >= dconf.word_fthres and rank <= dconf.
                              word_rthres) or w2vec.has_key(ww))
         # then add extra ones
         for w in extra_word_set:
             if w2vec.has_key(w) and (
                     not word_builder.has_key_currently(w)):
                 word_builder.feed_one(w)
         word_vocab = word_builder.finish()
         word_embed1 = word_vocab.filter_embed(
             w2vec,
             init_nohit=dconf.pretrain_init_nohit,
             scale=dconf.pretrain_scale)
     else:
         word_vocab = word_builder.finish_thresh(rthres=dconf.word_rthres,
                                                 fthres=dconf.word_fthres)
         word_embed1 = None
     #
     char_vocab = char_builder.finish()
     # todo(+1): extra pos/label symbols?
     TARGET_END = VocabHelper.convert_special_pattern("unk")
     pos_vocab = pos_builder.finish(
         target_range=(1, TARGET_END))  # only real tags
     label_vocab = label_builder.finish(target_range=(1, TARGET_END))
     # assign
     ret.put_voc("word", word_vocab)
     ret.put_voc("char", char_vocab)
     ret.put_voc("pos", pos_vocab)
     ret.put_voc("label", label_vocab)
     ret.put_emb("word", word_embed1)
     #
     return ret
Пример #2
0
 def build_from_stream(conf: OverallConf, stream, extra_stream):
     dconf = conf.dconf
     zlog("Build vocabs from streams.")
     ret = IEVocabPackage({}, {}, dconf)
     # here, collect them all
     # -- basic inputs
     word_builder = VocabBuilder("word")
     char_builder = VocabBuilder("char")
     lemma_builder = VocabBuilder("lemma")
     upos_builder = VocabBuilder("upos")
     ulabel_builder = VocabBuilder("ulabel")
     # -- outputs (event type, entity/filler type, arg role type) (type -> count)
     evt_type_builder = defaultdict(int)
     ef_type_builder = defaultdict(int)
     arg_role_builder = defaultdict(int)
     for inst in stream:
         # -- basic inputs
         for sent in inst.sents:
             word_builder.feed_stream(sent.words.vals)
             for w in sent.words.vals:
                 char_builder.feed_stream(w)
             lemma_builder.feed_stream(sent.lemmas.vals)
             upos_builder.feed_stream(sent.uposes.vals)
             ulabel_builder.feed_stream(sent.ud_labels.vals)
         # -- outputs
         # assert inst.entity_fillers is not None, "For building vocabs, need to provide training instances!"
         assert inst.events is not None, "For building vocabs, need to provide training instances!"
         if inst.entity_fillers is not None:
             for one_ef in inst.entity_fillers:
                 ef_type_builder[one_ef.type] += 1
         for one_evt in inst.events:
             evt_type_builder[one_evt.type] += 1
             if one_evt.links is not None:
                 for one_arg in one_evt.links:
                     arg_role_builder[one_arg.role] += 1
     # build real hlabel-types
     hl_evt = HLabelVocab("event", conf.mconf.hl_evt, evt_type_builder)
     hl_ef = HLabelVocab("entity_filler", conf.mconf.hl_ef, ef_type_builder)
     hl_arg = HLabelVocab("arg", conf.mconf.hl_arg, arg_role_builder)
     # deal with pre-trained word embeddings
     w2vec = None
     if dconf.init_from_pretrain:
         # todo(warn): for convenience, extra vocab (usually dev&test) is only used for collecting pre-train vecs
         # collect extra words and lemmas
         extra_word_set = set()
         extra_lemma_set = set()
         for inst in extra_stream:
             for sent in inst.sents:
                 for w in sent.words.vals:
                     extra_word_set.add(w)
                 for w in sent.lemmas.vals:
                     extra_lemma_set.add(w)
         # must provide dconf.pretrain_file
         w2vec = WordVectors.load(dconf.pretrain_file)
         # first filter according to thresholds
         word_builder.filter(lambda ww, rank, val:
                             (val >= dconf.word_fthres and rank <= dconf.
                              word_rthres) or w2vec.has_key(ww))
         lemma_builder.filter(lambda ww, rank, val:
                              (val >= dconf.word_fthres and rank <= dconf.
                               word_rthres) or w2vec.has_key(ww))
         # then add extra ones
         for w in extra_word_set:
             if w2vec.has_key(w) and (
                     not word_builder.has_key_currently(w)):
                 word_builder.feed_one(w)
         for w in extra_lemma_set:
             if w2vec.has_key(w) and (
                     not lemma_builder.has_key_currently(w)):
                 lemma_builder.feed_one(w)
         # finially build the vocab and embeds
         word_vocab = word_builder.finish()
         word_embed1 = word_vocab.filter_embed(
             w2vec,
             init_nohit=dconf.pretrain_init_nohit,
             scale=dconf.pretrain_scale)
         lemma_vocab = lemma_builder.finish()
         lemma_embed1 = lemma_vocab.filter_embed(
             w2vec,
             init_nohit=dconf.pretrain_init_nohit,
             scale=dconf.pretrain_scale)
         # first build pool-embeds, the final decision will depend on each of the flags
         # todo(WARN): assert all hit?
         hl_evt_pembed = hl_evt.filter_pembed(
             w2vec,
             init_nohit=dconf.pretrain_init_nohit,
             assert_all_hit=False)
         hl_ef_pembed = hl_ef.filter_pembed(
             w2vec,
             init_nohit=dconf.pretrain_init_nohit,
             assert_all_hit=False)
         hl_arg_pembed = hl_arg.filter_pembed(
             w2vec,
             init_nohit=dconf.pretrain_init_nohit,
             assert_all_hit=False)
         # by-product of filtered output pre-trained embeddings for later faster processing
         if dconf.output_pretrain_filter:
             w2vec.save_hits(dconf.output_pretrain_filter)
     else:
         word_vocab = word_builder.finish_thresh(rthres=dconf.word_rthres,
                                                 fthres=dconf.word_fthres)
         lemma_vocab = lemma_builder.finish_thresh(rthres=dconf.word_rthres,
                                                   fthres=dconf.word_fthres)
         word_embed1 = lemma_embed1 = None
         #
         for one_cc in [
                 conf.mconf.hl_evt, conf.mconf.hl_ef, conf.mconf.hl_arg
         ]:
             if hasattr(one_cc, "pool_init_hint"):
                 assert not one_cc.pool_init_hint, "cannot init pool because the overall pre-train-init flag is not set"
         hl_evt_pembed = hl_ef_pembed = hl_arg_pembed = None
     char_vocab = char_builder.finish()
     upos_vocab = upos_builder.finish()
     ulabel_vocab = ulabel_builder.finish()
     # =====
     # finally assign things
     ret.put_voc("word", word_vocab)
     ret.put_voc("lemma", lemma_vocab)
     ret.put_voc("char", char_vocab)
     ret.put_voc("upos", upos_vocab)
     ret.put_voc("ulabel", ulabel_vocab)
     ret.put_emb("word", word_embed1)
     ret.put_emb("lemma", lemma_embed1)
     # don't need to be jsonable since we are using pickle all at once
     # todo(WARN): the conf in vocab is also stored!!
     ret.put_voc("hl_evt", hl_evt)
     ret.put_voc("hl_ef", hl_ef)
     ret.put_voc("hl_arg", hl_arg)
     ret.put_emb("hl_evt", hl_evt_pembed)
     ret.put_emb("hl_ef", hl_ef_pembed)
     ret.put_emb("hl_arg", hl_arg_pembed)
     return ret
Пример #3
0
 def build_from_stream(build_conf: MLMVocabPackageConf, stream,
                       extra_stream):
     zlog("Build vocabs from streams.")
     ret = MLMVocabPackage({}, {})
     # -----
     if build_conf.add_ud2_pos_backoffs:
         ud2_pos_pre_list = list(VocabBuilder.DEFAULT_PRE_LIST) + [
             UD2_POS_UNK_MAP[p] for p in UD2_POS_LIST
         ]
         word_builder = VocabBuilder("word", pre_list=ud2_pos_pre_list)
     else:
         word_builder = VocabBuilder("word")
     char_builder = VocabBuilder("char")
     pos_builder = VocabBuilder("pos")
     deplabel_builder = VocabBuilder("deplabel")
     ner_builder = VocabBuilder("ner")
     if build_conf.add_ud2_prevalues:
         zlog(
             f"Add pre-defined UD2 values for upos({len(UD2_POS_LIST)}) and ulabel({len(UD2_LABEL_LIST)})."
         )
         pos_builder.feed_stream(UD2_POS_LIST)
         deplabel_builder.feed_stream(UD2_LABEL_LIST)
     for inst in stream:
         word_builder.feed_stream(inst.word_seq.vals)
         for w in inst.word_seq.vals:
             char_builder.feed_stream(w)
         # todo(+N): currently we are assuming that we are using UD pos/deps, and directly go with the default ones
         # pos and label can be optional??
         # if inst.poses.has_vals():
         #     pos_builder.feed_stream(inst.poses.vals)
         # if inst.deplabels.has_vals():
         #     deplabel_builder.feed_stream(inst.deplabels.vals)
         if hasattr(inst, "ner_seq") and inst.ner_seq.has_vals():
             ner_builder.feed_stream(inst.ner_seq.vals)
     # ===== embeddings
     w2vec = None
     if build_conf.read_from_pretrain:
         # todo(warn): for convenience, extra vocab (usually dev&test) is only used for collecting pre-train vecs
         extra_word_set = set(w for inst in extra_stream
                              for w in inst.word_seq.vals)
         # ----- load (possibly multiple) pretrain embeddings
         # must provide build_conf.pretrain_file (there can be multiple pretrain files!)
         list_pretrain_file, list_code_pretrain = build_conf.pretrain_file, build_conf.pretrain_codes
         list_code_pretrain.extend(
             [""] * len(list_pretrain_file))  # pad default ones
         w2vec = WordVectors.load(list_pretrain_file[0],
                                  aug_code=list_code_pretrain[0])
         if len(list_pretrain_file) > 1:
             w2vec.merge_others([
                 WordVectors.load(list_pretrain_file[i],
                                  aug_code=list_code_pretrain[i])
                 for i in range(1, len(list_pretrain_file))
             ])
         # -----
         # first filter according to thresholds
         word_builder.filter(
             lambda ww, rank, val: (val >= build_conf.word_fthres and rank
                                    <= build_conf.word_rthres) or
             (build_conf.ignore_thresh_with_pretrain and w2vec.has_key(ww)))
         # then add extra ones
         if build_conf.ignore_thresh_with_pretrain:
             for w in extra_word_set:
                 if w2vec.has_key(w) and (
                         not word_builder.has_key_currently(w)):
                     word_builder.feed_one(w)
         word_vocab = word_builder.finish()
         word_embed1 = word_vocab.filter_embed(
             w2vec,
             init_nohit=build_conf.pretrain_init_nohit,
             scale=build_conf.pretrain_scale)
     else:
         word_vocab = word_builder.finish_thresh(
             rthres=build_conf.word_rthres, fthres=build_conf.word_fthres)
         word_embed1 = None
     #
     char_vocab = char_builder.finish()
     pos_vocab = pos_builder.finish(sort_by_count=False)
     deplabel_vocab = deplabel_builder.finish(sort_by_count=False)
     ner_vocab = ner_builder.finish()
     # assign
     ret.put_voc("word", word_vocab)
     ret.put_voc("char", char_vocab)
     ret.put_voc("pos", pos_vocab)
     ret.put_voc("deplabel", deplabel_vocab)
     ret.put_voc("ner", ner_vocab)
     ret.put_emb("word", word_embed1)
     #
     return ret