Example #1
from collections import Counter
# SimpleVocab, yield_sents, zlog, etc. are helpers from the surrounding project;
# their exact import paths are not shown in these snippets.

def do_stat_srl(insts):
    cc = Counter()
    cc_narg = Counter()
    voc = SimpleVocab.build_empty()
    # set_ee_heads(insts)
    voc_pred, voc_arg = SimpleVocab.build_empty(), SimpleVocab.build_empty()
    voc_deplab = SimpleVocab.build_empty()
    for sent in yield_sents(insts):
        cc["sent"] += 1
        cc["tok"] += len(sent)
        cc["frame"] += len(sent.events)
        # --
        _tree = sent.tree_dep
        if _tree is not None:
            voc_deplab.feed_iter(_tree.seq_label.vals)
        for evt in sent.events:
            voc_pred.feed_one(evt.label)
            evt_widx = evt.mention.shead_widx  # head word index of the predicate mention (unused below)
            cc_narg[f"NARG={len(evt.args)}"] += 1
            for arg in evt.args:
                voc_arg.feed_one(arg.label)
                cc["arg"] += 1
                # check whether this argument overlaps any other argument of the same event
                for a2 in evt.args:
                    if a2 is arg: continue  # not self
                    # spans [widx, wridx) overlap unless one starts at or after the other's end
                    if not (arg.mention.widx >= a2.mention.wridx
                            or a2.mention.widx >= arg.mention.wridx):
                        cc["arg_overlap"] += 1
    # --
    voc.build_sort()  # note: voc is never fed in this snippet, so its table below stays empty
    voc_pred.build_sort()
    voc_arg.build_sort()
    voc_deplab.build_sort()
    # --
    # derived ratio statistics (assumes non-empty input)
    cc2 = dict(cc)
    cc2.update({
        "t/s": f"{cc['tok']/cc['sent']:.2f}",
        "f/s": f"{cc['frame']/cc['sent']:.2f}",
        "a/f": f"{cc['arg']/cc['frame']:.2f}"
    })
    zlog(f"CC: {cc2}")
    zlog(cc_narg)
    zlog(voc_arg.counts)
    # --
    MAX_PRINT_ITEMS = 20
    d_pred = voc_pred.get_info_table()
    print(d_pred[:MAX_PRINT_ITEMS].to_string())
    d_arg = voc_arg.get_info_table()
    print(d_arg[:MAX_PRINT_ITEMS].to_string())
    d_deplab = voc_deplab.get_info_table()
    print(d_deplab[:MAX_PRINT_ITEMS].to_string())
    d = voc.get_info_table()
    print(d[:MAX_PRINT_ITEMS].to_string())
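
These statistics helpers lean on project-specific utilities (SimpleVocab, yield_sents, zlog). As a rough, library-free sketch of the same feed/sort/report pattern, the following uses only the standard library; MiniVocab is a hypothetical stand-in for SimpleVocab, not its real API.

from collections import Counter

class MiniVocab:
    """Hypothetical stand-in for SimpleVocab: count items, then sort by frequency."""
    def __init__(self):
        self.counts = Counter()
    def feed_one(self, item):
        self.counts[item] += 1
    def feed_iter(self, items):
        for it in items:
            self.feed_one(it)
    def build_sort(self):
        self.sorted_items = self.counts.most_common()  # descending frequency

voc = MiniVocab()
voc.feed_iter(["nsubj", "obj", "nsubj", "punct"])
voc.build_sort()
print(voc.sorted_items[:20])  # top-N report, analogous to d[:MAX_PRINT_ITEMS]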
Example #2
def do_stat(insts):
    cc = Counter()
    voc = SimpleVocab.build_empty()
    for sent in yield_sents(insts):
        cc["sent"] += 1
        cc["tok"] += len(sent)
        cc["tok_pair"] += len(sent)**2
        _tree = sent.tree_dep
        if _tree is None:  # skip sentences without a dependency parse
            continue
        _deplabs = _tree.seq_label.vals
        _slen = len(sent)
        for i0 in range(_slen):
            for i1 in range(_slen):
                if abs(i0 - i1) > 5:  # only consider token pairs within distance 5
                    continue
                path1, path2 = _tree.get_path(i0, i1)  # two halves of the tree path between i0 and i1
                # sort so that (A, B) and (B, A) yield the same key
                labs1, labs2 = sorted(
                    [[_deplabs[z].split(":")[0] for z in path1],
                     [_deplabs[z].split(":")[0] for z in path2]])
                _len = len(labs1) + len(labs2)
                # if _len<=0 or _len>2 or "punct" in labs1 or "punct" in labs2:
                if _len != 2 or "punct" in labs1 or "punct" in labs2:
                    continue
                _k = (tuple(labs1), tuple(labs2))
                voc.feed_one(_k)
    # --
    zlog(cc)
    voc.build_sort()
    d = voc.get_info_table()
    print(d[:100].to_string())
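
Example #2's keys are order-insensitive pairs of label paths: sorting the two lists before converting them to tuples makes (A, B) and (B, A) collapse into one key. A minimal sketch of just that trick:

from collections import Counter

voc = Counter()
for labs1, labs2 in [(["nsubj"], ["obj"]), (["obj"], ["nsubj"])]:
    a, b = sorted([labs1, labs2])  # canonical order, regardless of input order
    voc[(tuple(a), tuple(b))] += 1
print(voc)  # Counter({(('nsubj',), ('obj',)): 2})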
Example #3
 def build_vocab(self, datasets: List):
     voc_upos = SimpleVocab.build_empty(self.name)
     for dataset in datasets:
         for sent in yield_sents(dataset.insts):
             voc_upos.feed_iter(sent.seq_upos.vals)
     # finished feeding; sort by frequency
     voc_upos.build_sort()
     return (voc_upos, )
Example #4
 def build_vocab(self, datasets: List):
     conf: ZTaskUdepConf = self.conf
     # --
     voc_udep = SimpleVocab.build_empty(self.name)
     for dataset in datasets:
         for sent in yield_sents(dataset.insts):
             _vals = sent.tree_dep.seq_label.vals
             if conf.use_l1:
                 _vals = [z.split(":")[0] for z in _vals]
             voc_udep.feed_iter(_vals)
     voc_udep.build_sort()
     _, udep_direct_range = voc_udep.non_special_range()  # range of direct labels
     zlog(f"Finish building voc_udep: {voc_udep}")
     return (voc_udep, udep_direct_range)
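
The use_l1 option keeps only the first-level (universal) part of each dependency label, dropping the language-specific subtype after the colon. A quick standalone demonstration with sample UD labels:

vals = ["nsubj:pass", "obl:tmod", "acl:relcl", "punct"]
l1_vals = [z.split(":")[0] for z in vals]
print(l1_vals)  # ['nsubj', 'obl', 'acl', 'punct']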
Example #5
 def build_from_stream(dconf: DConf, stream, extra_stream):
     zlog("Build vocabs from streams.")
     # here, collect them all
     # -- basic inputs
     voc_word = SimpleVocab.build_empty("word")
     voc_lemma = SimpleVocab.build_empty("lemma")
     voc_upos = SimpleVocab.build_empty("upos")
     voc_char = SimpleVocab.build_empty("char")
     voc_deplab = SimpleVocab.build_empty("deplab")
     # -- frame ones
     voc_evt, voc_ef, voc_arg = SimpleVocab.build_empty("evt"), SimpleVocab.build_empty("ef"), SimpleVocab.build_empty("arg")
     voc_collections = {"word": voc_word, "lemma": voc_lemma, "upos": voc_upos, "char": voc_char, "deplab": voc_deplab,
                        "evt": voc_evt, "ef": voc_ef, "arg": voc_arg}
     # read all and build
     for sent in yield_sents(stream):
         # -- basic inputs
         if sent.seq_word is not None:
             voc_word.feed_iter(sent.seq_word.vals)
             for w in sent.seq_word.vals:
                 voc_char.feed_iter(w)
         if sent.seq_lemma is not None:
             voc_lemma.feed_iter(sent.seq_lemma.vals)
         if sent.seq_upos is not None:
             voc_upos.feed_iter(sent.seq_upos.vals)
         if sent.tree_dep is not None and sent.tree_dep.seq_label is not None:
             voc_deplab.feed_iter(sent.tree_dep.seq_label.vals)
         # -- frames
         if sent.entity_fillers is not None:
             voc_ef.feed_iter((ef.type for ef in sent.entity_fillers))
         if sent.events is not None:
             voc_evt.feed_iter((evt.type for evt in sent.events))
             for evt in sent.events:
                 if evt.args is not None:
                     voc_arg.feed_iter((arg.role for arg in evt.args))
     # sort everyone!
     for voc in voc_collections.values():
         voc.build_sort()
     # extra for evt/arg
     if dconf.dict_frame_file:
         frames = default_json_serializer.from_file(dconf.dict_frame_file)
         for one_f in frames.values():  # no count, simply feed!!
             if len(one_f["lexUnit"]) > 0:  # todo(+W): currently ignore non-lex frames
                 voc_evt.feed_one(one_f["name"], c=0)
                 for one_fe in one_f["FE"]:
                     voc_arg.feed_one(one_fe["name"], c=0)
         zlog(f"After adding frames from {dconf.dict_frame_file}, evt={voc_evt}, arg={voc_arg}")
     # -----
     # deal with pre-trained word embeddings
     w2vec = None
     if dconf.pretrain_wv_file:
         # todo(warn): for convenience, extra vocab (usually dev&test) is only used for collecting pre-train vecs
         # collect extra words and lemmas
         extra_word_counts = {}
         extra_lemma_counts = {}
         for sent in yield_sents(extra_stream):
             if sent.seq_word is not None:
                 for w in sent.seq_word.vals:
                     extra_word_counts[w] = extra_word_counts.get(w, 0) + 1
             if sent.seq_lemma is not None:
                 for w in sent.seq_lemma.vals:
                     extra_lemma_counts[w] = extra_lemma_counts.get(w, 0) + 1
          # load the pre-trained vectors (dconf.pretrain_wv_file must be provided)
         w2vec = WordVectors.load(dconf.pretrain_wv_file)
         # first filter according to thresholds
         _filter_f = lambda ww, rank, val: (val >= dconf.word_fthres and rank <= dconf.word_rthres) or \
                                           w2vec.find_key(ww) is not None
         voc_word.build_filter(_filter_f)
         voc_lemma.build_filter(_filter_f)
         # then add extra ones
         for w in sorted(extra_word_counts.keys(), key=lambda z: (-extra_word_counts[z], z)):
             if w2vec.find_key(w) is not None and (w not in voc_word):
                 voc_word.feed_one(w)
         for w in sorted(extra_lemma_counts.keys(), key=lambda z: (-extra_lemma_counts[z], z)):
             if w2vec.find_key(w) is not None and (w not in voc_lemma):
                 voc_lemma.feed_one(w)
          # by-product: record which pre-trained keys were hit, so a filtered embedding file can be saved for faster later runs
         if dconf.pretrain_hits_outf:
             # find all keys again!!
             w2vec.clear_hits()
             for vv in [voc_word, voc_lemma]:
                 for _idx in range(*(vv.non_special_range())):
                     w2vec.find_key(vv.idx2word(_idx))
             w2vec.save_hits(dconf.pretrain_hits_outf)
         # embeds
         word_embed1 = voc_word.filter_embed(w2vec, scale=dconf.pretrain_scale)
         lemma_embed1 = voc_lemma.filter_embed(w2vec, scale=dconf.pretrain_scale)
     else:
         voc_word.build_filter_thresh(rthres=dconf.word_rthres, fthres=dconf.word_fthres)
         voc_lemma.build_filter_thresh(rthres=dconf.word_rthres, fthres=dconf.word_fthres)
         word_embed1 = lemma_embed1 = None
     # return
     ret = ZsfpVocabPackage(voc_collections, {"word": word_embed1, "lemma": lemma_embed1}, dconf)
     return ret
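
The filter in Example #5 keeps a word if it clears both the frequency and rank thresholds, or if the pre-trained table covers it. A self-contained sketch of that keep/drop rule, with a made-up "pretrained" set standing in for w2vec.find_key and invented threshold values:

pretrained = {"the", "dog", "walks"}  # hypothetical pre-trained vocabulary
word_fthres, word_rthres = 2, 3       # invented thresholds for illustration

def keep(word, rank, freq):
    # keep frequent, low-rank words, or anything covered by pre-trained vectors
    return (freq >= word_fthres and rank <= word_rthres) or word in pretrained

print(keep("the", rank=1, freq=100))    # True: clears both thresholds
print(keep("zymurgy", rank=9, freq=1))  # False: rare and not pre-trained
print(keep("walks", rank=9, freq=1))    # True: rescued by pre-trained coverage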