def do_stat_srl(insts):
    cc = Counter()
    cc_narg = Counter()
    voc = SimpleVocab.build_empty()
    # set_ee_heads(insts)
    voc_pred, voc_arg = SimpleVocab.build_empty(), SimpleVocab.build_empty()
    voc_deplab = SimpleVocab.build_empty()
    for sent in yield_sents(insts):
        cc["sent"] += 1
        cc["tok"] += len(sent)
        cc["frame"] += len(sent.events)
        # --
        _tree = sent.tree_dep
        if _tree is not None:
            voc_deplab.feed_iter(_tree.seq_label.vals)
        for evt in sent.events:
            voc_pred.feed_one(evt.label)
            evt_widx = evt.mention.shead_widx  # (currently unused)
            cc_narg[f"NARG={len(evt.args)}"] += 1
            for arg in evt.args:
                voc_arg.feed_one(arg.label)
                cc["arg"] += 1
                # check arg overlap (count pairs of args whose spans overlap)
                cc["arg_overlap"] += 0  # ensure the key exists even with no overlaps
                for a2 in evt.args:
                    if a2 is arg:
                        continue  # not self
                    # spans [widx, wridx) overlap iff neither starts at or after the other's end
                    if not (arg.mention.widx >= a2.mention.wridx or a2.mention.widx >= arg.mention.wridx):
                        cc["arg_overlap"] += 1
    # --
    voc.build_sort()
    voc_pred.build_sort()
    voc_arg.build_sort()
    voc_deplab.build_sort()
    # --
    # get more stats
    cc2 = dict(cc)
    cc2.update({
        "t/s": f"{cc['tok']/cc['sent']:.2f}",
        "f/s": f"{cc['frame']/cc['sent']:.2f}",
        "a/f": f"{cc['arg']/cc['frame']:.2f}",
    })
    zlog(f"CC: {cc2}")
    zlog(cc_narg)
    zlog(voc_arg.counts)
    # --
    MAX_PRINT_ITEMS = 20
    d_pred = voc_pred.get_info_table()
    print(d_pred[:MAX_PRINT_ITEMS].to_string())
    d_arg = voc_arg.get_info_table()
    print(d_arg[:MAX_PRINT_ITEMS].to_string())
    d_deplab = voc_deplab.get_info_table()
    print(d_deplab[:MAX_PRINT_ITEMS].to_string())
    d = voc.get_info_table()
    print(d[:MAX_PRINT_ITEMS].to_string())

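# A minimal, self-contained sketch of the overlap predicate used above (hypothetical
# helper; `widx` is the span start and `wridx` the exclusive end, as assumed from the
# code): two half-open spans overlap iff neither starts at or after the other's end.
def _spans_overlap(widx1, wridx1, widx2, wridx2):
    return not (widx1 >= wridx2 or widx2 >= wridx1)

assert _spans_overlap(0, 3, 2, 5)      # [0,3) and [2,5) share token 2
assert not _spans_overlap(0, 3, 3, 5)  # adjacent spans do not overlap
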
def do_stat(insts):
    cc = Counter()
    voc = SimpleVocab.build_empty()
    for sent in yield_sents(insts):
        cc["sent"] += 1
        cc["tok"] += len(sent)
        cc["tok_pair"] += len(sent) ** 2
        _tree = sent.tree_dep
        _deplabs = _tree.seq_label.vals
        _slen = len(sent)
        for i0 in range(_slen):
            for i1 in range(_slen):
                if abs(i0 - i1) > 5:
                    continue
                path1, path2 = _tree.get_path(i0, i1)
                labs1, labs2 = sorted([[_deplabs[z].split(":")[0] for z in path1],
                                       [_deplabs[z].split(":")[0] for z in path2]])
                _len = len(labs1) + len(labs2)
                # keep only pairs whose two paths have exactly two (non-punct) edges in total
                # if _len <= 0 or _len > 2 or "punct" in labs1 or "punct" in labs2:
                if _len != 2 or "punct" in labs1 or "punct" in labs2:
                    continue
                _k = (tuple(labs1), tuple(labs2))
                voc.feed_one(_k)
    # --
    zlog(cc)
    voc.build_sort()
    d = voc.get_info_table()
    print(d[:100].to_string())

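# A hedged illustration of the keys counted above (assuming `get_path(i0, i1)` returns
# the label paths from each token up to their lowest common ancestor): for a pair like
# (subject, object) of the same verb, the paths would be ["nsubj"] and ["obj"], giving
# the key below after subtype stripping and sorting.
labs1, labs2 = sorted([["nsubj"], ["obj"]])
assert (tuple(labs1), tuple(labs2)) == (("nsubj",), ("obj",))
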
def build_vocab(self, datasets: List):
    voc_upos = SimpleVocab.build_empty(self.name)
    for dataset in datasets:
        for sent in yield_sents(dataset.insts):
            voc_upos.feed_iter(sent.seq_upos.vals)
    # finished
    voc_upos.build_sort()
    return (voc_upos, )

def aug_words_and_embs(emb_node: EmbeddingNode, orig_vocab: SimpleVocab,
                       aug_vocab: SimpleVocab, aug_wv: WordVectors, aug_scale: float = 1.0):
    orig_arr = emb_node.E.E.detach().cpu().numpy()
    # todo(+2): find same-spelling words in the original vocab if not hit in the extra_embed?
    # todo(warn): here, all words in aug_vocab should be found in aug_wv
    aug_arr = aug_vocab.filter_embed(aug_wv, scale=aug_scale, assert_all_hit=True)
    new_vocab, new_arr = SimpleVocab.aug_vocab_and_arr(orig_vocab, orig_arr, aug_vocab, aug_arr, aug_override=True)
    # assign the enlarged embedding matrix back to the node
    BK.set_value(emb_node.E.E, new_arr, resize=True)
    return new_vocab

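# A hedged numpy sketch of the augmentation idea (the real merging lives in
# SimpleVocab.aug_vocab_and_arr; shapes here are hypothetical): rows for the extra
# words are appended to the original embedding matrix so that the enlarged vocab and
# matrix stay index-aligned.
import numpy as np
orig_arr = np.zeros((3, 4), dtype=np.float32)  # 3 original words, dim=4
aug_arr = np.ones((2, 4), dtype=np.float32)    # 2 new words from extra embeddings
new_arr = np.concatenate([orig_arr, aug_arr], axis=0)
assert new_arr.shape == (5, 4)
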
def __init__(self, cons: Constrainer, src_vocab: SimpleVocab, trg_vocab: SimpleVocab,
             conf: ConstrainerNodeConf, **kwargs):
    super().__init__(conf, **kwargs)
    conf: ConstrainerNodeConf = self.conf
    # --
    # input vocab
    if src_vocab is None:  # make our own src_vocab
        cons_keys = sorted(cons.cmap.keys())  # simply get all the keys
        src_vocab = SimpleVocab.build_by_static(cons_keys, pre_list=["non"], post_list=None)  # non==0!
    # output vocab
    assert trg_vocab is not None
    out_size = len(trg_vocab)  # output size is len(trg_vocab)
    trg_is_seq_vocab = isinstance(trg_vocab, SeqVocab)
    _trg_get_f = (lambda x: trg_vocab.get_range_by_basename(x)) if trg_is_seq_vocab \
        else (lambda x: trg_vocab.get(x))
    # set up the 0/1 validity matrix: [len(src_vocab), out_size]
    _vec = np.full((len(src_vocab), out_size), 0., dtype=np.float32)
    assert src_vocab.non == 0
    _vec[0] = 1.     # by default: src-non is all valid!
    _vec[:, 0] = 1.  # by default: trg-non is all valid!
    # --
    stat = {"k_skip": 0, "k_hit": 0, "v_skip": 0, "v_hit": 0}
    for k, v in cons.cmap.items():
        idx_k = src_vocab.get(k)
        if idx_k is None:
            stat["k_skip"] += 1
            continue  # skip no_hit!
        stat["k_hit"] += 1
        for k2 in v.keys():
            idx_k2 = _trg_get_f(k2)
            if idx_k2 is None:
                stat["v_skip"] += 1
                continue
            stat["v_hit"] += 1
            if trg_is_seq_vocab:
                _vec[idx_k, idx_k2[0]:idx_k2[1]] = 1.  # hit range
            else:
                _vec[idx_k, idx_k2] = 1.  # hit!!
    zlog(f"Setup ConstrainerNode ok: vec={_vec.shape}, stat={stat}")
    # --
    self.cons = cons
    self.src_vocab = src_vocab
    self.trg_vocab = trg_vocab
    self.vec = BK.input_real(_vec)

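# A small numpy sketch of the constraint matrix built above (toy sizes): rows are
# source keys, columns are target labels, and a cell of 1 marks an allowed (src, trg)
# pair; row 0 and column 0 (the "non" entries) are always valid.
import numpy as np
vec = np.zeros((3, 3), dtype=np.float32)  # e.g. 3 src keys x 3 trg labels
vec[0] = 1.     # src-non: everything allowed
vec[:, 0] = 1.  # trg-non: always allowed
vec[1, 2] = 1.  # hypothetical constraint: src key 1 permits trg label 2
assert vec[1].tolist() == [1., 0., 1.]
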
def __init__(self, conf: ZDecoderUDEPConf, name: str, vocab_udep: SimpleVocab, ref_enc: ZEncoder, **kwargs):
    super().__init__(conf, name, **kwargs)
    conf: ZDecoderUDEPConf = self.conf
    self.vocab_udep = vocab_udep
    _enc_dim, _head_dim = ref_enc.get_enc_dim(), ref_enc.get_head_dim()
    # --
    self.helper = ZDecoderUDEPHelper(conf, self.vocab_udep)
    self._label_idx_root = vocab_udep.get("root")  # get root's index for decoding
    # --
    # nodes
    self.depth_node: IdecNode = conf.depth_conf.make_node(_isize=_enc_dim, _nhead=_head_dim, _csize=1)
    self.udep_node: IdecNode = conf.udep_conf.make_node(_isize=_enc_dim, _nhead=_head_dim, _csize=len(vocab_udep))

def main(args):
    # conf
    conf: ZOverallConf = init_everything(ZOverallConf(), args)
    # task
    t_center = TaskCenter(conf.tconf)
    enc = t_center.tasks['enc']
    # data
    d_center = DataCenter(conf.dconf)
    for dataset in d_center.get_datasets():
        enc.prepare_dataset(dataset)
        vv = SimpleVocab.build_by_static([])
        vv2 = SimpleVocab.build_by_static([])
        for item in dataset.items:
            vv.feed_one(item._batch_len)
            vv2.feed_one(sum(len(z) for z in item.sents) + 1)
        # sort by the key itself (here, the length value)
        vv.build_sort(lambda w, i, c: w)
        vv2.build_sort(lambda w, i, c: w)
        zlog(f"#== For {dataset} (subword):\n{vv.get_info_table().to_string()}")
        zlog(f"#== For {dataset} (word):\n{vv2.get_info_table().to_string()}")
    # --
    zlog("The end of Building.")

def build_vocab(self, datasets: List):
    conf: ZTaskUdepConf = self.conf
    # --
    voc_udep = SimpleVocab.build_empty(self.name)
    for dataset in datasets:
        for sent in yield_sents(dataset.insts):
            _vals = sent.tree_dep.seq_label.vals
            if conf.use_l1:  # keep only the first (universal) part of each label
                _vals = [z.split(":")[0] for z in _vals]
            voc_udep.feed_iter(_vals)
    voc_udep.build_sort()
    _, udep_direct_range = voc_udep.non_special_range()  # range of direct labels
    zlog(f"Finish building voc_udep: {voc_udep}")
    return (voc_udep, udep_direct_range)

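# The use_l1 option above keeps only the universal part of a UD relation, dropping
# language-specific subtypes:
assert "nsubj:pass".split(":")[0] == "nsubj"
assert "obl:tmod".split(":")[0] == "obl"
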
def prepare_test(args):
    conf: OverallConf = init_everything(OverallConf(), args)
    dconf, tconf = conf.dconf, conf.tconf
    # vocab
    vpack = ZmtlVocabPackage.build_by_reading(dconf)
    # prepare data
    test_streamer = dconf.R.get_reader(input_path=dconf.test)
    # model
    model = build_model(conf, vpack=vpack)
    if dconf.model_load_name != "":
        model.load(dconf.model_load_name, strict=dconf.model_load_strict)
    else:
        zwarn("No model to load, Debugging mode??")
    # =====
    # augment with extra embeddings
    extra_embed_files = dconf.test_extra_pretrain_wv_files
    model_emb = model.get_emb()
    if model_emb is not None:
        _embedder = model_emb.eg.get_embedder("word")
        if len(extra_embed_files) > 0 and _embedder is not None:  # has extra_emb and need_emb
            # get embeddings
            extra_embedding = WordVectors.load(extra_embed_files[0])
            extra_embedding.merge_others([WordVectors.load(one_file) for one_file in extra_embed_files[1:]])
            # get extra dictionary (only those words hit in extra-embed)
            extra_vocab = SimpleVocab.build_by_static(
                get_extra_hit_words(test_streamer, extra_embedding, vpack.get_voc("word")),
                pre_list=None, post_list=None)
            # give them to the model
            new_vocab = aug_words_and_embs(_embedder, vpack.get_voc("word"), extra_vocab, extra_embedding,
                                           aug_scale=dconf.pretrain_scale)
            vpack.put_voc("word", new_vocab)
    # =====
    # No Cache!!
    test_inst_preparer = model.get_inst_preper(False)
    test_iter, _ = batch_stream(index_stream(test_streamer, vpack, False, False, test_inst_preparer), tconf, False)
    return conf, model, vpack, test_iter

def main(vocab_file: str, input_path: str, output_file='lt.pkl'):
    # first get vocab
    vocabs = default_pickle_serializer.from_file(vocab_file)
    arg_voc = vocabs[0]['arg']
    zlog(f"Read {arg_voc} from {vocab_file}")
    # make it a BIO-vocab
    bio_voc = SeqVocab(arg_voc)
    zlog(f"Build bio-voc of {bio_voc}")
    # read insts
    insts = list(ReaderGetterConf().get_reader(input_path=input_path))  # read from stdin
    all_sents = list(yield_sents(insts))
    # --
    mat = np.ones([len(bio_voc), len(bio_voc)], dtype=np.float32)  # add-1 smoothing!
    cc = Counter()
    for sent in all_sents:
        for evt in sent.events:
            labels = ['O'] * len(sent)
            for arg in evt.args:
                widx, wlen = arg.mention.get_span()
                labels[widx:widx+wlen] = ["B-" + arg.role] + ["I-" + arg.role] * (wlen - 1)
            for a, b in zip(labels, labels[1:]):
                cc[f"{a}->{b}"] += 1
                mat[bio_voc[a], bio_voc[b]] += 1
    # --
    v = SimpleVocab()
    for name, count in cc.items():
        v.feed_one(name, count)
    v.build_sort()
    print(v.get_info_table()[:50].to_string())
    # OtherHelper.printd(cc)
    # --
    # normalize each row, take the log, and save
    mat = mat / mat.sum(-1, keepdims=True)
    mat = np.log(mat)
    default_pickle_serializer.to_file(mat, output_file)

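# A self-contained sketch of the transition statistics above (toy label set; the real
# index mapping comes from SeqVocab): counts start at 1 for add-1 smoothing, rows are
# normalized to probabilities, and the log is taken for later use as additive
# transition scores.
import numpy as np
labels = ["O", "B-ARG0", "I-ARG0"]
idx = {t: i for i, t in enumerate(labels)}
mat = np.ones((len(labels), len(labels)), dtype=np.float32)  # add-1 smoothing
seq = ["O", "B-ARG0", "I-ARG0", "O"]
for a, b in zip(seq, seq[1:]):
    mat[idx[a], idx[b]] += 1
mat = np.log(mat / mat.sum(-1, keepdims=True))
assert np.allclose(np.exp(mat).sum(-1), 1.0)  # each row is a distribution
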
def build_from_stream(dconf: DConf, stream, extra_stream):
    zlog("Build vocabs from streams.")
    # here, collect them all
    # -- basic inputs
    voc_word = SimpleVocab.build_empty("word")
    voc_lemma = SimpleVocab.build_empty("lemma")
    voc_upos = SimpleVocab.build_empty("upos")
    voc_char = SimpleVocab.build_empty("char")
    voc_deplab = SimpleVocab.build_empty("deplab")
    # -- frame ones
    voc_evt, voc_ef, voc_arg = SimpleVocab.build_empty("evt"), SimpleVocab.build_empty("ef"), SimpleVocab.build_empty("arg")
    voc_collections = {"word": voc_word, "lemma": voc_lemma, "upos": voc_upos, "char": voc_char, "deplab": voc_deplab,
                       "evt": voc_evt, "ef": voc_ef, "arg": voc_arg}
    # read all and build
    for sent in yield_sents(stream):
        # -- basic inputs
        if sent.seq_word is not None:
            voc_word.feed_iter(sent.seq_word.vals)
            for w in sent.seq_word.vals:
                voc_char.feed_iter(w)
        if sent.seq_lemma is not None:
            voc_lemma.feed_iter(sent.seq_lemma.vals)
        if sent.seq_upos is not None:
            voc_upos.feed_iter(sent.seq_upos.vals)
        if sent.tree_dep is not None and sent.tree_dep.seq_label is not None:
            voc_deplab.feed_iter(sent.tree_dep.seq_label.vals)
        # -- frames
        if sent.entity_fillers is not None:
            voc_ef.feed_iter((ef.type for ef in sent.entity_fillers))
        if sent.events is not None:
            voc_evt.feed_iter((evt.type for evt in sent.events))
            for evt in sent.events:
                if evt.args is not None:
                    voc_arg.feed_iter((arg.role for arg in evt.args))
    # sort everyone!
    for voc in voc_collections.values():
        voc.build_sort()
    # extra for evt/arg
    if dconf.dict_frame_file:
        frames = default_json_serializer.from_file(dconf.dict_frame_file)
        for one_f in frames.values():  # no count, simply feed!!
            if len(one_f["lexUnit"]) > 0:  # todo(+W): currently ignore non-lex frames
                voc_evt.feed_one(one_f["name"], c=0)
                for one_fe in one_f["FE"]:
                    voc_arg.feed_one(one_fe["name"], c=0)
        zlog(f"After adding frames from {dconf.dict_frame_file}, evt={voc_evt}, arg={voc_arg}")
    # -----
    # deal with pre-trained word embeddings
    w2vec = None
    if dconf.pretrain_wv_file:
        # todo(warn): for convenience, the extra vocab (usually dev&test) is only used for collecting pre-trained vecs
        # collect extra words and lemmas
        extra_word_counts = {}
        extra_lemma_counts = {}
        for sent in yield_sents(extra_stream):
            if sent.seq_word is not None:
                for w in sent.seq_word.vals:
                    extra_word_counts[w] = extra_word_counts.get(w, 0) + 1
            if sent.seq_lemma is not None:
                for w in sent.seq_lemma.vals:
                    extra_lemma_counts[w] = extra_lemma_counts.get(w, 0) + 1
        # must provide dconf.pretrain_file
        w2vec = WordVectors.load(dconf.pretrain_wv_file)
        # first filter according to thresholds
        _filter_f = lambda ww, rank, val: (val >= dconf.word_fthres and rank <= dconf.word_rthres) or \
            w2vec.find_key(ww) is not None
        voc_word.build_filter(_filter_f)
        voc_lemma.build_filter(_filter_f)
        # then add extra ones (those hit in the pre-trained embeddings)
        for w in sorted(extra_word_counts.keys(), key=lambda z: (-extra_word_counts[z], z)):
            if w2vec.find_key(w) is not None and (w not in voc_word):
                voc_word.feed_one(w)
        for w in sorted(extra_lemma_counts.keys(), key=lambda z: (-extra_lemma_counts[z], z)):
            if w2vec.find_key(w) is not None and (w not in voc_lemma):
                voc_lemma.feed_one(w)
        # by-product: filtered pre-trained embeddings output for later faster processing
        if dconf.pretrain_hits_outf:
            # find all keys again!!
            w2vec.clear_hits()
            for vv in [voc_word, voc_lemma]:
                for _idx in range(*(vv.non_special_range())):
                    w2vec.find_key(vv.idx2word(_idx))
            w2vec.save_hits(dconf.pretrain_hits_outf)
        # embeds
        word_embed1 = voc_word.filter_embed(w2vec, scale=dconf.pretrain_scale)
        lemma_embed1 = voc_lemma.filter_embed(w2vec, scale=dconf.pretrain_scale)
    else:
        voc_word.build_filter_thresh(rthres=dconf.word_rthres, fthres=dconf.word_fthres)
        voc_lemma.build_filter_thresh(rthres=dconf.word_rthres, fthres=dconf.word_fthres)
        word_embed1 = lemma_embed1 = None
    # return
    ret = ZsfpVocabPackage(voc_collections, {"word": word_embed1, "lemma": lemma_embed1}, dconf)
    return ret

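# A hedged sketch of the vocabulary filter above: a word survives if it passes the
# frequency/rank thresholds, or if it has a pre-trained vector (the thresholds and
# the has_pretrained flag are stand-ins for the dconf/w2vec objects used above).
def _keep(rank, val, has_pretrained, fthres=2, rthres=100000):
    return (val >= fthres and rank <= rthres) or has_pretrained

assert _keep(rank=10, val=5, has_pretrained=False)      # frequent enough
assert _keep(rank=999999, val=1, has_pretrained=True)   # rescued by a pre-trained vec
assert not _keep(rank=999999, val=1, has_pretrained=False)
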
def index_char_seq(self, seq: InputCharSeqField, voc: SimpleVocab, allow_unk: bool):
    if seq is not None:
        seq_idxes = [[voc.get_else_unk(c) for c in z] for z in seq.vals] if allow_unk \
            else [[voc[c] for c in z] for z in seq.vals]
        seq.set_idxes(seq_idxes)

def index_items(self, items, voc: SimpleVocab, allow_unk: bool):
    for item in items:
        item.set_label_idx(voc.get_else_unk(item.label) if allow_unk else voc[item.label])

def index_seq(self, seq: SeqField, voc: SimpleVocab, allow_unk: bool):
    if seq is not None:
        seq_idxes = [voc.get_else_unk(z) for z in seq.vals] if allow_unk else [voc[z] for z in seq.vals]
        seq.set_idxes(seq_idxes)

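# A minimal sketch of the allow_unk behavior shared by the three indexers above,
# using a plain dict in place of SimpleVocab (get_else_unk falls back to the UNK
# index, while voc[...] raises on unseen labels):
voc = {"NOUN": 1, "VERB": 2}
UNK = 0
vals = ["NOUN", "ADJ"]
idxes_unk = [voc.get(z, UNK) for z in vals]  # like get_else_unk
assert idxes_unk == [1, 0]
# with allow_unk=False, voc["ADJ"] would raise KeyError instead
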