Example #1
def do_stat_srl(insts):
    cc = Counter()
    cc_narg = Counter()
    voc = SimpleVocab.build_empty()
    # set_ee_heads(insts)
    voc_pred, voc_arg = SimpleVocab.build_empty(), SimpleVocab.build_empty()
    voc_deplab = SimpleVocab.build_empty()
    for sent in yield_sents(insts):
        cc["sent"] += 1
        cc["tok"] += len(sent)
        cc["frame"] += len(sent.events)
        # --
        _tree = sent.tree_dep
        if _tree is not None:
            voc_deplab.feed_iter(_tree.seq_label.vals)
        for evt in sent.events:
            voc_pred.feed_one(evt.label)
            evt_widx = evt.mention.shead_widx
            cc_narg[f"NARG={len(evt.args)}"] += 1
            for arg in evt.args:
                voc_arg.feed_one(arg.label)
                cc["arg"] += 1
                # check overlap against the event's other args (each overlapping pair counts twice)
                for a2 in evt.args:
                    if a2 is arg: continue  # not self
                    if not (arg.mention.widx >= a2.mention.wridx
                            or a2.mention.widx >= arg.mention.wridx):
                        cc["arg_overlap"] += 1  # the two spans intersect
                    else:
                        cc["arg_overlap"] += 0  # touch the key so it always appears in the stats
    # --
    voc.build_sort()
    voc_pred.build_sort()
    voc_arg.build_sort()
    voc_deplab.build_sort()
    # --
    # get more stat
    cc2 = dict(cc)
    cc2.update({
        "t/s": f"{cc['tok']/cc['sent']:.2f}",
        "f/s": f"{cc['frame']/cc['sent']:.2f}",
        "a/f": f"{cc['arg']/cc['frame']:.2f}"
    })
    zlog(f"CC: {cc2}")
    zlog(cc_narg)
    zlog(voc_arg.counts)
    # --
    MAX_PRINT_ITEMS = 20
    d_pred = voc_pred.get_info_table()
    print(d_pred[:MAX_PRINT_ITEMS].to_string())
    d_arg = voc_arg.get_info_table()
    print(d_arg[:MAX_PRINT_ITEMS].to_string())
    d_deplab = voc_deplab.get_info_table()
    print(d_deplab[:MAX_PRINT_ITEMS].to_string())
    d = voc.get_info_table()
    print(d[:MAX_PRINT_ITEMS].to_string())
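The overlap test above is the standard disjoint-interval check: two spans are disjoint iff one starts at or after the other ends, and each overlapping pair is counted once per direction. A minimal self-contained sketch of the same logic, with plain `(widx, wridx)` tuples standing in for the library's `Mention` objects:

from collections import Counter

def count_overlaps(spans):
    # spans: list of (widx, wridx) half-open intervals
    cc = Counter()
    for i, (s1, e1) in enumerate(spans):
        for j, (s2, e2) in enumerate(spans):
            if i == j:
                continue  # not self
            if not (s1 >= e2 or s2 >= e1):  # neither starts after the other ends
                cc["arg_overlap"] += 1  # counted twice per overlapping pair
    return cc

print(count_overlaps([(0, 2), (1, 3), (5, 6)]))  # Counter({'arg_overlap': 2})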
Example #2
def do_stat(insts):
    cc = Counter()
    voc = SimpleVocab.build_empty()
    for sent in yield_sents(insts):
        cc["sent"] += 1
        cc["tok"] += len(sent)
        cc["tok_pair"] += len(sent)**2
        _tree = sent.tree_dep
        _deplabs = _tree.seq_label.vals
        _slen = len(sent)
        for i0 in range(_slen):
            for i1 in range(_slen):
                if abs(i0 - i1) > 5:
                    continue
                path1, path2 = _tree.get_path(i0, i1)
                labs1, labs2 = sorted(
                    [[_deplabs[z].split(":")[0] for z in path1],
                     [_deplabs[z].split(":")[0] for z in path2]])
                _len = len(labs1) + len(labs2)
                # if _len<=0 or _len>2 or "punct" in labs1 or "punct" in labs2:
                if _len != 2 or "punct" in labs1 or "punct" in labs2:
                    continue
                _k = (tuple(labs1), tuple(labs2))
                voc.feed_one(_k)
    # --
    zlog(cc)
    voc.build_sort()
    d = voc.get_info_table()
    print(d[:100].to_string())
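Note that `sorted([...])` canonicalizes the two path-label lists, so the resulting `(tuple, tuple)` key is independent of which endpoint comes first. A stand-alone illustration with toy labels (not the library's tree API):

labs1, labs2 = sorted([["obj"], ["nsubj"]])
assert (tuple(labs1), tuple(labs2)) == (("nsubj",), ("obj",))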
Example #3
 def build_vocab(self, datasets: List):
     voc_upos = SimpleVocab.build_empty(self.name)
     for dataset in datasets:
         for sent in yield_sents(dataset.insts):
             voc_upos.feed_iter(sent.seq_upos.vals)
     # finished
     voc_upos.build_sort()
     return (voc_upos, )
Example #4
def aug_words_and_embs(emb_node: EmbeddingNode,
                       orig_vocab: SimpleVocab,
                       aug_vocab: SimpleVocab,
                       aug_wv: WordVectors,
                       aug_scale: float = 1.0):
    orig_arr = emb_node.E.E.detach().cpu().numpy()
    # todo(+2): find same-spelling words in the original vocab if not-hit in the extra_embed?
    # todo(warn): aug_vocab entries should all be found in aug_wv
    aug_arr = aug_vocab.filter_embed(aug_wv,
                                     scale=aug_scale,
                                     assert_all_hit=True)
    new_vocab, new_arr = SimpleVocab.aug_vocab_and_arr(orig_vocab,
                                                       orig_arr,
                                                       aug_vocab,
                                                       aug_arr,
                                                       aug_override=True)
    # assign
    BK.set_value(emb_node.E.E, new_arr, resize=True)
    return new_vocab
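Example #9 below shows a full call site for this helper. Conceptually, `SimpleVocab.aug_vocab_and_arr` stacks the extra embedding rows after the original ones so that augmented words receive indices past the original vocabulary; a toy numpy sketch of that idea (not the library implementation):

import numpy as np

orig_arr = np.zeros((3, 4), dtype=np.float32)  # 3 original words, embedding dim 4
aug_arr = np.ones((2, 4), dtype=np.float32)    # 2 extra words hit in the new embedding file
new_arr = np.concatenate([orig_arr, aug_arr], axis=0)
assert new_arr.shape == (5, 4)                 # extra words get indices 3 and 4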
Example #5
 def __init__(self, cons: Constrainer, src_vocab: SimpleVocab, trg_vocab: SimpleVocab, conf: ConstrainerNodeConf, **kwargs):
     super().__init__(conf, **kwargs)
     conf: ConstrainerNodeConf = self.conf
     # --
     # input vocab
     if src_vocab is None:  # make our own src_vocab
         cons_keys = sorted(cons.cmap.keys())  # simply get all the keys
         src_vocab = SimpleVocab.build_by_static(cons_keys, pre_list=["non"], post_list=None)  # non==0!
     # output vocab
     assert trg_vocab is not None
     out_size = len(trg_vocab)  # output size is len(trg_vocab)
     trg_is_seq_vocab = isinstance(trg_vocab, SeqVocab)
     _trg_get_f = (lambda x: trg_vocab.get_range_by_basename(x)) if trg_is_seq_vocab else (lambda x: trg_vocab.get(x))
     # set it up
     _vec = np.full((len(src_vocab), out_size), 0., dtype=np.float32)
     assert src_vocab.non == 0
     _vec[0] = 1.  # by default: src-non is all valid!
     _vec[:,0] = 1.  # by default: trg-non is all valid!
     # --
     stat = {"k_skip": 0, "k_hit": 0, "v_skip": 0, "v_hit": 1}
     for k, v in cons.cmap.items():
         idx_k = src_vocab.get(k)
         if idx_k is None:
             stat["k_skip"] += 1
             continue  # skip no_hit!
         stat["k_hit"] += 1
         for k2 in v.keys():
             idx_k2 = _trg_get_f(k2)
             if idx_k2 is None:
                 stat["v_skip"] += 1
                 continue
             stat["v_hit"] += 1
             if trg_is_seq_vocab:
                 _vec[idx_k, idx_k2[0]:idx_k2[1]] = 1.  # hit range
             else:
                 _vec[idx_k, idx_k2] = 1.  # hit!!
     zlog(f"Setup ConstrainerNode ok: vec={_vec.shape}, stat={stat}")
     # --
     self.cons = cons
     self.src_vocab = src_vocab
     self.trg_vocab = trg_vocab
     self.vec = BK.input_real(_vec)
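The constructor precomputes a dense `[len(src_vocab), len(trg_vocab)]` validity mask: row `k` marks which target labels the constrainer allows under source key `k`, and index 0 ("non") is valid in both directions by default. A toy numpy version of that setup (hypothetical 3x4 vocabulary sizes):

import numpy as np

_vec = np.full((3, 4), 0., dtype=np.float32)
_vec[0] = 1.     # src "non" (index 0): all targets valid
_vec[:, 0] = 1.  # trg "non" (index 0): valid under every source key
_vec[1, 2] = 1.  # one constraint hit: src key 1 allows trg label 2
print(_vec)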
Example #6
 def __init__(self, conf: ZDecoderUDEPConf, name: str, vocab_udep: SimpleVocab, ref_enc: ZEncoder, **kwargs):
     super().__init__(conf, name, **kwargs)
     conf: ZDecoderUDEPConf = self.conf
     self.vocab_udep = vocab_udep
     _enc_dim, _head_dim = ref_enc.get_enc_dim(), ref_enc.get_head_dim()
     # --
     self.helper = ZDecoderUDEPHelper(conf, self.vocab_udep)
     self._label_idx_root = vocab_udep.get("root")  # get root's index for decoding
     # --
     # nodes
     self.depth_node: IdecNode = conf.depth_conf.make_node(_isize=_enc_dim, _nhead=_head_dim, _csize=1)
     self.udep_node: IdecNode = conf.udep_conf.make_node(_isize=_enc_dim, _nhead=_head_dim, _csize=len(vocab_udep))
Example #7
def main(args):
    # conf
    conf: ZOverallConf = init_everything(ZOverallConf(), args)
    # task
    t_center = TaskCenter(conf.tconf)
    enc = t_center.tasks['enc']
    # data
    d_center = DataCenter(conf.dconf)
    for dataset in d_center.get_datasets():
        enc.prepare_dataset(dataset)
        vv = SimpleVocab.build_by_static([])
        vv2 = SimpleVocab.build_by_static([])
        for item in dataset.items:
            vv.feed_one(item._batch_len)
            vv2.feed_one(sum(len(z) for z in item.sents) + 1)
        vv.build_sort(lambda w, i, c: w)
        vv2.build_sort(lambda w, i, c: w)
        zlog(
            f"#== For {dataset} (subword):\n{vv.get_info_table().to_string()}")
        zlog(f"#== For {dataset} (word):\n{vv2.get_info_table().to_string()}")
    # --
    zlog("The end of Building.")
Example #8
 def build_vocab(self, datasets: List):
     conf: ZTaskUdepConf = self.conf
     # --
     voc_udep = SimpleVocab.build_empty(self.name)
     for dataset in datasets:
         for sent in yield_sents(dataset.insts):
             _vals = sent.tree_dep.seq_label.vals
             if conf.use_l1:
                 _vals = [z.split(":")[0] for z in _vals]
             voc_udep.feed_iter(_vals)
     voc_udep.build_sort()
     _, udep_direct_range = voc_udep.non_special_range()  # range of direct labels
     zlog(f"Finish building voc_udep: {voc_udep}")
     return (voc_udep, udep_direct_range)
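With `conf.use_l1`, enhanced dependency labels are truncated to their coarse (level-1) component before being counted, e.g.:

assert "nsubj:pass".split(":")[0] == "nsubj"  # enhanced label -> coarse label
assert "obl".split(":")[0] == "obl"           # plain labels pass through unchanged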
Example #9
def prepare_test(args):
    conf: OverallConf = init_everything(OverallConf(), args)
    dconf, tconf = conf.dconf, conf.tconf
    # vocab
    vpack = ZmtlVocabPackage.build_by_reading(dconf)
    # prepare data
    test_streamer = dconf.R.get_reader(input_path=dconf.test)
    # model
    model = build_model(conf, vpack=vpack)
    if dconf.model_load_name != "":
        model.load(dconf.model_load_name, strict=dconf.model_load_strict)
    else:
        zwarn("No model to load, Debugging mode??")
    # =====
    # augment with extra embeddings
    extra_embed_files = dconf.test_extra_pretrain_wv_files
    model_emb = model.get_emb()
    if model_emb is not None:
        _embedder = model_emb.eg.get_embedder("word")
        if len(extra_embed_files) > 0 and _embedder is not None:  # has extra_emb and need_emb
            # get embeddings
            extra_embedding = WordVectors.load(extra_embed_files[0])
            extra_embedding.merge_others([
                WordVectors.load(one_file)
                for one_file in extra_embed_files[1:]
            ])
            # get extra dictionary (only those words hit in extra-embed)
            extra_vocab = SimpleVocab.build_by_static(
                get_extra_hit_words(test_streamer, extra_embedding, vpack.get_voc("word")),
                pre_list=None, post_list=None)
            # give them to the model
            new_vocab = aug_words_and_embs(_embedder,
                                           vpack.get_voc("word"),
                                           extra_vocab,
                                           extra_embedding,
                                           aug_scale=dconf.pretrain_scale)
            vpack.put_voc("word", new_vocab)
    # =====
    # No Cache!!
    test_inst_preparer = model.get_inst_preper(False)
    test_iter, _ = batch_stream(
        index_stream(test_streamer, vpack, False, False, test_inst_preparer),
        tconf, False)
    return conf, model, vpack, test_iter
Example #10
def main(vocab_file: str, input_path: str, output_file='lt.pkl'):
    # first get vocab
    vocabs = default_pickle_serializer.from_file(vocab_file)
    arg_voc = vocabs[0]['arg']
    zlog(f"Read {arg_voc} from {vocab_file}")
    # make it to BIO-vocab
    bio_voc = SeqVocab(arg_voc)
    zlog(f"Build bio-voc of {bio_voc}")
    # read insts
    insts = list(ReaderGetterConf().get_reader(
        input_path=input_path))  # read instances from input_path
    all_sents = list(yield_sents(insts))
    # --
    mat = np.ones([len(bio_voc), len(bio_voc)],
                  dtype=np.float32)  # add-1 smoothing!
    cc = Counter()
    for sent in all_sents:
        for evt in sent.events:
            labels = ['O'] * len(sent)
            for arg in evt.args:
                widx, wlen = arg.mention.get_span()
                labels[widx:wlen] = ["B-" + arg.role
                                     ] + ["I-" + arg.role] * (wlen - 1)
            for a, b in zip(labels, labels[1:]):
                cc[f"{a}->{b}"] += 1
                mat[bio_voc[a], bio_voc[b]] += 1
        # --
    # --
    v = SimpleVocab()
    for name, count in cc.items():
        v.feed_one(name, count)
    v.build_sort()
    print(v.get_info_table()[:50].to_string())
    # OtherHelper.printd(cc)
    # --
    # normalize & log according to row and save
    mat = mat / mat.sum(-1, keepdims=True)
    mat = np.log(mat)
    default_pickle_serializer.to_file(mat, output_file)
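Since the saved matrix holds row-normalized log-probabilities, scoring a BIO label sequence later reduces to summing transition lookups. A self-contained sketch of the smoothing, normalization, and lookup, with a toy 3-label mapping standing in for `bio_voc`:

import numpy as np

bio = {"O": 0, "B-ARG0": 1, "I-ARG0": 2}
mat = np.ones([3, 3], dtype=np.float32)         # add-1 smoothing
mat[bio["B-ARG0"], bio["I-ARG0"]] += 5.         # pretend we observed B->I five times
mat = np.log(mat / mat.sum(-1, keepdims=True))  # row-wise log-probabilities

seq = ["O", "B-ARG0", "I-ARG0"]
score = sum(mat[bio[a], bio[b]] for a, b in zip(seq, seq[1:]))
print(score)  # log P(B-ARG0 | O) + log P(I-ARG0 | B-ARG0)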
Example #11
 def build_from_stream(dconf: DConf, stream, extra_stream):
     zlog("Build vocabs from streams.")
     # here, collect them all
     # -- basic inputs
     voc_word = SimpleVocab.build_empty("word")
     voc_lemma = SimpleVocab.build_empty("lemma")
     voc_upos = SimpleVocab.build_empty("upos")
     voc_char = SimpleVocab.build_empty("char")
     voc_deplab = SimpleVocab.build_empty("deplab")
     # -- frame ones
     voc_evt, voc_ef, voc_arg = SimpleVocab.build_empty("evt"), SimpleVocab.build_empty("ef"), SimpleVocab.build_empty("arg")
     voc_collections = {"word": voc_word, "lemma": voc_lemma, "upos": voc_upos, "char": voc_char, "deplab": voc_deplab,
                        "evt": voc_evt, "ef": voc_ef, "arg": voc_arg}
     # read all and build
     for sent in yield_sents(stream):
         # -- basic inputs
         if sent.seq_word is not None:
             voc_word.feed_iter(sent.seq_word.vals)
             for w in sent.seq_word.vals:
                 voc_char.feed_iter(w)
         if sent.seq_lemma is not None:
             voc_lemma.feed_iter(sent.seq_lemma.vals)
         if sent.seq_upos is not None:
             voc_upos.feed_iter(sent.seq_upos.vals)
         if sent.tree_dep is not None and sent.tree_dep.seq_label is not None:
             voc_deplab.feed_iter(sent.tree_dep.seq_label.vals)
         # -- frames
         if sent.entity_fillers is not None:
             voc_ef.feed_iter((ef.type for ef in sent.entity_fillers))
         if sent.events is not None:
             voc_evt.feed_iter((evt.type for evt in sent.events))
             for evt in sent.events:
                 if evt.args is not None:
                     voc_arg.feed_iter((arg.role for arg in evt.args))
     # sort everyone!
     for voc in voc_collections.values():
         voc.build_sort()
     # extra for evt/arg
     if dconf.dict_frame_file:
         frames = default_json_serializer.from_file(dconf.dict_frame_file)
         for one_f in frames.values():  # no count, simply feed!!
             if len(one_f["lexUnit"]) > 0:  # todo(+W): currently ignore non-lex frames
                 voc_evt.feed_one(one_f["name"], c=0)
                 for one_fe in one_f["FE"]:
                     voc_arg.feed_one(one_fe["name"], c=0)
         zlog(f"After adding frames from {dconf.dict_frame_file}, evt={voc_evt}, arg={voc_arg}")
     # -----
     # deal with pre-trained word embeddings
     w2vec = None
     if dconf.pretrain_wv_file:
         # todo(warn): for convenience, extra vocab (usually dev&test) is only used for collecting pre-train vecs
         # collect extra words and lemmas
         extra_word_counts = {}
         extra_lemma_counts = {}
         for sent in yield_sents(extra_stream):
             if sent.seq_word is not None:
                 for w in sent.seq_word.vals:
                     extra_word_counts[w] = extra_word_counts.get(w, 0) + 1
             if sent.seq_lemma is not None:
                 for w in sent.seq_lemma.vals:
                     extra_lemma_counts[w] = extra_lemma_counts.get(w, 0) + 1
         # must provide dconf.pretrain_file
         w2vec = WordVectors.load(dconf.pretrain_wv_file)
         # first filter according to thresholds
         _filter_f = lambda ww, rank, val: (val >= dconf.word_fthres and rank <= dconf.word_rthres) or \
                                           w2vec.find_key(ww) is not None
         voc_word.build_filter(_filter_f)
         voc_lemma.build_filter(_filter_f)
         # then add extra ones
         for w in sorted(extra_word_counts.keys(), key=lambda z: (-extra_word_counts[z], z)):
             if w2vec.find_key(w) is not None and (w not in voc_word):
                 voc_word.feed_one(w)
         for w in sorted(extra_lemma_counts.keys(), key=lambda z: (-extra_lemma_counts[z], z)):
             if w2vec.find_key(w) is not None and (w not in voc_lemma):
                 voc_lemma.feed_one(w)
         # by-product: save the filtered pre-trained embeddings for faster processing later
         if dconf.pretrain_hits_outf:
             # find all keys again!!
             w2vec.clear_hits()
             for vv in [voc_word, voc_lemma]:
                 for _idx in range(*(vv.non_special_range())):
                     w2vec.find_key(vv.idx2word(_idx))
             w2vec.save_hits(dconf.pretrain_hits_outf)
         # embeds
         word_embed1 = voc_word.filter_embed(w2vec, scale=dconf.pretrain_scale)
         lemma_embed1 = voc_lemma.filter_embed(w2vec, scale=dconf.pretrain_scale)
     else:
         voc_word.build_filter_thresh(rthres=dconf.word_rthres, fthres=dconf.word_fthres)
         voc_lemma.build_filter_thresh(rthres=dconf.word_rthres, fthres=dconf.word_fthres)
         word_embed1 = lemma_embed1 = None
     # return
     ret = ZsfpVocabPackage(voc_collections, {"word": word_embed1, "lemma": lemma_embed1}, dconf)
     return ret
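The `_filter_f` predicate above keeps a word if it clears both frequency thresholds or is covered by the pre-trained vectors. The same rule as a plain function (a sketch; the threshold defaults here are arbitrary, and `in_pretrained` stands in for `w2vec.find_key(ww) is not None`):

def keep_word(count: int, rank: int, in_pretrained: bool,
              fthres: int = 2, rthres: int = 50000) -> bool:
    # keep if frequent enough by both thresholds, OR found in pre-trained embeddings
    return (count >= fthres and rank <= rthres) or in_pretrained

assert keep_word(count=10, rank=100, in_pretrained=False)
assert keep_word(count=1, rank=999999, in_pretrained=True)
assert not keep_word(count=1, rank=999999, in_pretrained=False)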
Example #12
 def index_char_seq(self, seq: InputCharSeqField, voc: SimpleVocab, allow_unk: bool):
     if seq is not None:
         if allow_unk:
             seq_idxes = [[voc.get_else_unk(c) for c in z] for z in seq.vals]
         else:
             seq_idxes = [[voc[c] for c in z] for z in seq.vals]
         seq.set_idxes(seq_idxes)
Example #13
 def index_items(self, items, voc: SimpleVocab, allow_unk: bool):
     for item in items:
         item.set_label_idx(voc.get_else_unk(item.label) if allow_unk else voc[item.label])
Example #14
 def index_seq(self, seq: SeqField, voc: SimpleVocab, allow_unk: bool):
     if seq is not None:
         if allow_unk:
             seq_idxes = [voc.get_else_unk(z) for z in seq.vals]
         else:
             seq_idxes = [voc[z] for z in seq.vals]
         seq.set_idxes(seq_idxes)
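The `allow_unk` switch in these indexers is the difference between a defaulting lookup and a strict one. A dictionary-based analogue (not the SimpleVocab API):

voc = {"<unk>": 0, "the": 1, "cat": 2}
UNK = voc["<unk>"]
vals = ["the", "dog"]

print([voc.get(z, UNK) for z in vals])  # allow_unk=True -> [1, 0]
try:
    [voc[z] for z in vals]              # allow_unk=False: strict lookup
except KeyError as e:
    print("out-of-vocabulary:", e)      # raises on "dog"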