Example #1
 def __init__(self, results: Dict=None, description=None, score: float=None):
     self.results = results if results is not None else {}
     self.description = description
     if score is not None:
         if ResultRecord.RES_KEY in self.results:
             zwarn(f"RES_KEY already exists, rewrite it: {self.results[ResultRecord.RES_KEY]} -> {score}")
         self.results[ResultRecord.RES_KEY] = score
Example #2
 def from_pretrained(conf: ZEncoderBertConf):
     bert_name, cache_dir = conf.bert_model, conf.cache_dir_or_none
     zlog(
         f"Loading pre-trained bert model for ZBert of {bert_name} from {cache_dir}"
     )
     # --
     from transformers import AutoTokenizer
     tokenizer = AutoTokenizer.from_pretrained(bert_name,
                                               cache_dir=cache_dir)
     sub_toker = ZBertSubwordTokenizer(bert_name, tokenizer)
     mtype = {
         "bert": BertModel,
         "roberta": RobertaModel,
         "xlm": XLMRobertaModel
     }[bert_name.split("/")[-1].split("-")[0]]
     if conf.bert_no_pretrain:
         from transformers import AutoConfig
         bert_config = AutoConfig.from_pretrained(bert_name)
         model = mtype(bert_config)
         zwarn("No pretrain-loading for bert, really want this?")
     else:
         model = mtype.from_pretrained(bert_name, cache_dir=cache_dir)
     # --
     if hasattr(model, "pooler"):  # note: delete unused part!
         delattr(model, "pooler")
     # --
     model.eval()  # note: by default set eval!!
     # --
     zlog(f"Load ok, move to default device {BK.DEFAULT_DEVICE}")
     model.to(BK.DEFAULT_DEVICE)
     zlog("Move ok!")
     return tokenizer, sub_toker, model
Example #3
 def forward(self, med: ZMediator):
     ibatch_seq_info = med.ibatch.seq_info
     # prepare input, truncate if too long
     _input_ids, _input_masks, _input_segids = \
         ibatch_seq_info.enc_input_ids, ibatch_seq_info.enc_input_masks, ibatch_seq_info.enc_input_segids
     _eff_input_ids = med.get_cache('eff_input_ids')  # note: special name!!
     if _eff_input_ids is not None:
         _input_ids = _eff_input_ids
     # --
     if BK.get_shape(_input_ids, -1) > self.tokenizer.model_max_length:
         _full_len = BK.get_shape(_input_ids, -1)
         _max_len = self.tokenizer.model_max_length
         zwarn(
             f"Input too long for bert, truncate it: {BK.get_shape(_input_ids)} => {_max_len}"
         )
         _input_ids, _input_masks, _input_segids = \
             _input_ids[:,:_max_len], _input_masks[:,:_max_len], _input_segids[:,:_max_len]
         # todo(+W+N): how to handle decoders for these cases?
     # forward
     ret = self.bert.forward(_input_ids,
                             _input_masks,
                             _input_segids,
                             med=med)
     # extra
     if self.gcn:
         ret = self.gcn.forward(med)
     # --
     return ret
Example #4
 def reset_parameters(self):  # random reset!
     conf: EmbeddingConf = self.conf
     BK.init_param(self.E, "glorot", lookup=True, scale=conf.init_scale)
     if self.has_npvec_init:
         zwarn(
             "Reset Embedding to random, maybe need to reassign with pre-trained ones?!"
         )
Example #5
 def _prep_sent(self, sent: Sent):
     conf: MySRLConf = self.conf
     slen = len(sent)
     _loss_weight_non = getattr(sent, "_loss_weight_non", 1.)  # todo(+N): special name; loss_weight_non
     # note: for simplicity, assume no loss_weight_non for args
     # first for events
     evt_arr = np.full([slen], 0, dtype=np.int64)  # [evt]
     arg_arr = np.full([slen, slen], 0, dtype=np.int64)  # [evt, arg]
     evt_items = np.full([slen], None, dtype=object)  # [evt]
     for f in sent.get_frames(conf.evt_ftag):  # note: assume no overlapping
         # predicate
         evt_widx, evt_wlen = self.evt_span_getter(f.mention)
         evt_label = f.label_idx
         assert evt_wlen==1 and evt_label>0, "For simplicity!!"
         evt_items[evt_widx] = f
         evt_arr[evt_widx] = evt_label
         # arguments
         if conf.arg_only_rank1:
             cur_args = [a for a in f.args if a.info.get("rank", 1) == 1]
         else:
             cur_args = f.args
         # bio or not
         if conf.arg_use_bio:  # special
             arg_spans = [self.arg_span_getter(a.mention) + (a.label_idx,) for a in cur_args]
             tag_layers = self.vocab_arg.spans2tags_idx(arg_spans, slen)
             if len(tag_layers) > 1:
                 zwarn(f"Warning: 'Full args require multiple layers with {arg_spans}")
             arg_arr[evt_widx, :] = tag_layers[0][0]  # directly assign it!
         else:  # plain ones
             for a in cur_args:
                 arg_role = a.label_idx
                 arg_widx, arg_wlen = self.arg_span_getter(a.mention)
                 arg_arr[evt_widx, arg_widx:arg_widx+arg_wlen] = arg_role
     return ZObject(sent=sent, slen=slen, loss_weight_non=_loss_weight_non,
                    evt_items=evt_items, evt_arr=evt_arr, arg_arr=arg_arr)
Example #6
 def __init__(self, conf: AttentionPlainConf, **kwargs):
     super().__init__(conf, **kwargs)
     conf: AttentionPlainConf = self.conf
     dim_q, dim_k, dim_v, nh_qk, d_qk, nh_v, d_v = \
         conf.dim_q, conf.dim_k, conf.dim_v, conf.nh_qk, conf.d_qk, conf.nh_v, conf.d_v
     # --
     self._att_scale = math.sqrt(conf.d_qk)  # scale for score
     # pre-att affines (no dropouts here!)
     _eg_q = BK.get_inita_xavier_uniform((d_qk, dim_q)) / BK.get_inita_xavier_uniform((nh_qk*d_qk, dim_q))
     self.affine_q = AffineNode(None, isize=dim_q, osize=nh_qk*d_qk, no_drop=True, init_scale=_eg_q*conf.init_scale_hin)
     _eg_k = BK.get_inita_xavier_uniform((d_qk, dim_k)) / BK.get_inita_xavier_uniform((nh_qk*d_qk, dim_k))
     self.affine_k = AffineNode(None, isize=dim_k, osize=nh_qk*d_qk, no_drop=True, init_scale=_eg_k*conf.init_scale_hin)
     self.affine_v = AffineNode(None, isize=dim_v, osize=nh_v*d_v, no_drop=True)
     # rel dist keys
     self.rposi = RelDistNode(conf.rel, _dim=d_qk) if conf.use_rposi else None
     # att & output
     if conf.useaff_qk2v:
         self.aff_qk2v = AffineNode(None, isize=nh_qk, osize=nh_v)
     else:
         # assert nh_qk == nh_v
         if nh_qk != nh_v:
             zwarn(f"Possible problems with AttNode since hin({nh_qk}) != hout({nh_v})")
     self.adrop = DropoutNode(None, drop_rate=conf.att_drop, fix_drop=False)
     # todo(note): with drops(y) & act(?) & bias(y)?
     self.final_linear = AffineNode(None, isize=nh_v*d_v, osize=dim_v, out_act=conf.out_act)
Example #7
 def collect_tokens(self, char_idx: int, char_len: int):
     # collect all tokens
     index_chars = self.full_char_idxes
     tokens = []
     for ii in range(char_idx, char_idx + char_len):
         vv = index_chars[ii]
         if vv is not None:
             if len(tokens) == 0 or vv != tokens[-1]:  # find a new one
                 assert len(tokens)==0 or (vv[0]==tokens[-1][0] and vv[1]==tokens[-1][1]+1) \
                        or (vv[0]==tokens[-1][0]+1 and vv[1]==0)  # assert continuing span
                 tokens.append(vv)
     # --
     # check
     str0 = ''.join(self.offset_str[char_idx:char_idx + char_len].split())
     str1 = ''.join([
         ''.join(self.sent_tokens[sid][wid].split()) for sid, wid in tokens
     ])
     if str0 not in str1:
         # note: a very strange 'ar' case ...
         if str1 == ''.join(str0.split("_")) or set(str0).difference(
                 set(str1)) == set(chr(1618)):
             zwarn(f"=> Slightly unmatch: {str0} vs {str1}")
         else:
             raise RuntimeError(f"Unmatched strings: {str0} vs {str1}")
     return tokens
Example #8
def pieces2tree(words: List[str], xposes: List[str], parses: List[str]):
    rets = []
    assert len(words) == len(xposes) and len(words) == len(parses)
    for w, xp, pp in zip(words, xposes, parses):
        try:
            p0, p1 = pp.split("*")  # must be two pieces
        except:  # note: this can be caused by empty [word]!
            zwarn(f"Bad parse-bit: {pp}, assume that is '*'")
            p0, p1 = '', ''
            if xp in ["*", "-"]:
                xp = "XX"  # also fix pos
        new_w = []
        for c in w:
            # note: for simplicity, map brackets to "<"/">" to avoid -LCB-/-RCB- style tokens
            new_w.append({
                '(': "-LRB-",
                ')': "-RRB-",
                '<': "<",
                '>': ">",
                '[': "<",
                ']': ">",
                '{': "<",
                '}': ">",
                '{': "<",
                '}': ">",
                '〈': "<",
                '〉': ">"
            }.get(c, c))
        if xp == '(': xp = "-LRB-"
        elif xp == ')': xp = "-RRB-"
        rets.append(f"{p0} ({xp} {''.join(new_w)}) {p1}")
    tree_ret = " ".join(rets)
    tree_fix = check_and_fix_tree(tree_ret)
    return tree_fix
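A minimal usage sketch for pieces2tree (hypothetical inputs; assumes the function and check_and_fix_tree from Example #23 are importable from the same module). The parse column follows the CoNLL-2012 convention of one '*'-anchored bracket fragment per token:
words = ["John", "runs", "."]
xposes = ["NNP", "VBZ", "."]
parses = ["(S(NP*)", "(VP*)", "*)"]  # hypothetical CoNLL-style parse bits, one per token
tree = pieces2tree(words, xposes, parses)
# -> roughly "(S(NP (NNP John) ) (VP (VBZ runs) ) (. .) )", a single balanced tree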
Example #9
 def __init__(self,
              conf: PlainInputEmbedderConf,
              voc: SimpleVocab,
              npvec: np.ndarray = None,
              name="UNK"):
     super().__init__(conf, name)
     # --
     conf: PlainInputEmbedderConf = self.conf
     self.voc = voc
     # check init embeddings
     if conf.init_from_pretrain:
         zlog(
             f"Try to init {self.extra_repr()} with npvec.shape={npvec.shape if (npvec is not None) else None}"
         )
         if npvec is None:
             zwarn("warning: cannot get pre-trained embeddings to init!!")
     # get rare unk range
     voc_rare_unk_mask = []
     for w in self.voc.full_i2w:
         c = self.voc.word2count(w, df=None)
         voc_rare_unk_mask.append(
             float(c is not None and c <= conf.rare_unk_thr))
     self.rare_unk_mask = BK.input_real(voc_rare_unk_mask)  # stored tensor!
     # self.register_buffer()  # todo(note): do we need register buffer?
     self.use_rare_unk = (conf.rare_unk_rate > 0. and conf.rare_unk_thr > 0)
     # add the real embedding node
     self.E = EmbeddingNode(conf.econf,
                            npvec=npvec,
                            osize=conf.dim,
                            n_words=len(self.voc))
Example #10
def main(args):
    # conf
    conf: ZOverallConf = init_everything(ZOverallConf(), args)
    # task
    t_center = TaskCenter(conf.tconf)
    # data
    d_center = DataCenter(conf.dconf, specified_wset=["test"])
    # load vocab
    t_center.load_vocabs(t_center.conf.vocab_load_dir)
    # prepare datasets
    t_center.prepare_datasets(d_center.get_datasets())
    # build model
    model = ZModel(conf.mconf)
    t_center.build_mods(model)
    model.finish_sr()  # note: build sr before possible loading in testing!!
    # run
    r_center = RunCenter(conf.rconf, model, t_center, d_center)
    if conf.rconf.model_load_name != "":
        r_center.load(conf.rconf.model_load_name)
    else:
        zwarn("No model to load, Debugging mode??")
    res = r_center.do_test()
    zlog(f"zzzztestfinal: {res}")
    # --
    zlog("The end of Testing.")
Example #11
 def query(self, sent):
     conf = self.conf
     # --
     key = tuple(self._get_words(sent))
     if (not conf.no_exact_match) and key in self.exact_map:
         return self.exact_map[key]
     else:  # allow fuzzy match with only some char differences!
         good_ones = []  # (sent, (err_word, err_char))
         for cand in self.length_maps.get(len(key), []):
             # --
             if id(cand) in self.fuzzy_hit_ids:
                 continue
             # --
             cand_key = self._get_words(cand)
             assert len(cand_key) == len(key)
             _budget_words = min(conf.fuzzy_seq_wnum,
                                 int(conf.fuzzy_seq_wrate * len(key)))
             is_good = True
             cand_err_word, cand_err_char = 0, 0
             for w1, w2 in zip(cand_key, key):
                 # simple filter
                 if len(w1) - len(w2) > conf.fuzzy_word_cnum:
                     is_good = False
                     break
                 # digits filter
                 if (all(c in self.special_set for c in w1)
                         or all(c in self.special_set for c in w2)) and (w1 != w2):
                     is_good = False
                     break
                 # --
                 # special distance: only allow same_len diff, prefix, suffix
                 _err = self._get_edit_distance(w1, w2)
                 # --
                 if _err > 0:
                     cand_err_word += 1
                     cand_err_char += _err
                     if _err <= conf.fuzzy_word_cnum:
                         _budget_words -= 1
                         if _budget_words < 0:  # differ too much overall
                             is_good = False
                             break
                     else:  # differ too much in one word
                         is_good = False
                         break
             is_good = (is_good and (_budget_words >= 0))
             if is_good:
                 good_ones.append((cand, (cand_err_word, cand_err_char)))
         # return the least err one!
         if len(good_ones) >= 2:
             zwarn(f"Get multiple options for {key}")
         # return None if len(good_ones)<=0 else min(good_ones, key=lambda x: x[-1])[0]
         # note: only return if there is "the only one"!!
         ret = None if len(good_ones) != 1 else good_ones[0][0]
         if conf.fuzzy_no_repeat_query and ret is not None:
             self.fuzzy_hit_ids.add(id(ret))
         return ret
Example #12
 def __init__(self, conf: PosiEmbeddingConf, **kwargs):
     super().__init__(conf, **kwargs)
     conf: PosiEmbeddingConf = self.conf
     # --
     self.E = BK.new_param([conf.max_val - conf.min_val + 1, conf.osize])
     self.reset_parameters()
     if conf.freeze:
         self.rop.add_fixed_value("trainable", False)
         if not conf.init_sincos:
             zwarn("Meaningless to freeze random posi-embeddings?")
Example #13
 def put(self, sent):
     assert self.simple_searcher is None, "Cannot grow it dynamically!"
     _key = self._get_key(sent)
     # --
     _cands = self.key_cands[_key]
     if not any(s.seq_word.vals == sent.seq_word.vals for s in _cands):
         # no exact matching!
         self.key_cands[_key].append(sent)
         self.all_items.append(sent)
         if len(self.key_cands[_key]) > 1:
             zwarn(
                 f"Sents with same keys: {[z.seq_word for z in self.key_cands[_key]]}"
             )
Example #14
 def _read_coreness_from_nltk(which_fn="fn15"):
     which_fn = {"fn15": 15, '': 15, 'fn17': 17}.get(which_fn, None)
     if which_fn is None:
         zwarn("Cannot read coreness, simply let it be EMPTY!!")
         return {}
     if which_fn == 15:
         from nltk.corpus import framenet15 as nltk_fn
     else:
         from nltk.corpus import framenet as nltk_fn
     # --
     cmap = {}  # FrameName -> {RoleName -> CoreType}
     for frame in nltk_fn.frames():
         cmap[frame.name] = {k:v.coreType for k,v in frame.FE.items()}
     return cmap
Example #15
 def load(self, prefix="./"):
     for name in self.vocabs:
         fname = prefix + "vv_" + name + ".txt"
         if os.path.exists(fname):
             vtype = self.voc_types.get(name, self._default_vocab_type)
             self.vocabs[name] = vtype.read_from_file(fname)
         else:
             zwarn("Cannot find Vocab " + name)
             self.vocabs[name] = None
     for name in self.embeds:
         fname = prefix + "ve_" + name + ".pic"
         if os.path.exists(fname):
             self.embeds[name] = default_pickle_serializer.from_file(fname)
         else:
             self.embeds[name] = None
Example #16
 def check_params(self, ps):
     remaining_keys = set(self.all_params.keys())
     extra_ones = []
     to_check = list(ps)
     for p in to_check:
         p_id = id(p)
         if p_id in remaining_keys:
             remaining_keys.remove(p_id)
         else:
             extra_ones.append((get_shape(p), ))
     missing_ones = [(k, self.all_params[k]) for k in remaining_keys]
     if len(extra_ones)>0:
         zwarn("Check-params extra:" + '\n'.join([str(x) for x in extra_ones]))
     if len(missing_ones)>0:
         zwarn("Check-params missing:" + '\n'.join([str(x) for x in missing_ones]))
Example #17
 def _read_coreness_from_file(file: str):
     frame_map = default_json_serializer.from_file(file)
     cmap = {}  # FrameName -> {RoleName -> CoreType}
     for f, v in frame_map.items():
         assert f not in cmap, f"Err: repeated frame {f}"
         new_map = {}
         for fe in v["FE"]:
             role, core_type = fe["name"], fe["coreType"]
             # assert role not in new_map, f"Err: repeated frame-role {f}:{role}"
             if role in new_map:  # skip this one!
                 zwarn(f"repeated frame-role {f}:{role}")
             else:
                 new_map[role] = core_type
         cmap[f] = new_map
     return cmap
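To make the expected input concrete, here is a hypothetical sketch of the JSON layout this reader consumes and the condensed mapping it produces (frame and role names invented for illustration):
# Hypothetical shape of what default_json_serializer.from_file(file) returns:
frame_map = {
    "Motion": {"FE": [{"name": "Theme", "coreType": "Core"},
                      {"name": "Path", "coreType": "Core"}]},
    "Placing": {"FE": [{"name": "Agent", "coreType": "Core"}]},
}
# _read_coreness_from_file condenses it to FrameName -> {RoleName -> CoreType}:
expected_cmap = {"Motion": {"Theme": "Core", "Path": "Core"}, "Placing": {"Agent": "Core"}}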
Example #18
 def _parse_mention(self, mention: Dict, doc: Doc) -> Mention:
     # get mention
     main_posi_info = mention.get("posi")
     if main_posi_info is None:
         return None  # no posi info!!
     sid, widx, wlen = self._read_posi(main_posi_info)
     ret = Mention.create(doc.sents[sid], widx, wlen)
     # possible head span?
     head_posi_info = mention.get("head", {}).get("posi")
     if head_posi_info is not None:
         head_sid, head_widx, head_wlen = self._read_posi(head_posi_info)
         if head_sid != sid or not (head_widx>=widx and head_widx+head_wlen<=widx+wlen):
             zwarn(f"Error in head: {head_posi_info} vs. {main_posi_info}")
         else:  # make sure things are correct! otherwise simply discard!!
             ret.set_span(head_widx, head_wlen, hspan=True)
     return ret
Example #19
def main(args):
    # conf
    conf: ZOverallConf = init_everything(ZOverallConf(), args)
    # task
    t_center = TaskCenter(conf.tconf)
    # data
    d_center = DataCenter(conf.dconf,
                          specified_wset=[])  # nothing to load here!
    # load vocab
    t_center.load_vocabs(t_center.conf.vocab_load_dir)
    # prepare datasets
    t_center.prepare_datasets(d_center.get_datasets())
    # build model
    model = ZModel(conf.mconf)
    t_center.build_mods(model)
    model.finish_sr()  # note: build sr before possible loading in testing!!
    # run
    r_center = RunCenter(conf.rconf, model, t_center, d_center)
    if conf.rconf.model_load_name != "":
        r_center.load(conf.rconf.model_load_name)
    else:
        zwarn("No model to load, Debugging mode??")
    # =====
    cc = Counter()
    BATCH_LINE = int(os.environ.get('ZMSP_BATCH_LINE', 1000))  # default: 1000 sents at a time
    test_dataset = ZDataset(d_center.conf.testM,
                            'testM',
                            'decode',
                            _no_load=True)  # use testM for other options!
    for lines in yield_lines(sys.stdin, BATCH_LINE):
        insts = [Sent.create(one.split())
                 for one in lines]  # note: simply split as sentence!!
        test_dataset.set_insts(insts)  # directly set it!
        cc["sent"] += len(insts)
        if cc["sent"] % 50000 == 0:
            zlog(f"Decode for {cc}")
        # --
        t_center.prepare_datasets([test_dataset])  # re-prepare!!
        for ibatch in test_dataset.yield_batches(loop=False):
            one_res = model.predict_on_batch(ibatch)
        # --
        for inst in insts:
            sys.stdout.write(
                json.dumps(inst.to_json(), ensure_ascii=False) + "\n")
    # =====
    zlog(f"The end of Decoding: {cc}")
Example #20
def prepare_test(args):
    conf: OverallConf = init_everything(OverallConf(), args)
    dconf, tconf = conf.dconf, conf.tconf
    # vocab
    vpack = ZmtlVocabPackage.build_by_reading(dconf)
    # prepare data
    test_streamer = dconf.R.get_reader(input_path=dconf.test)
    # model
    model = build_model(conf, vpack=vpack)
    if dconf.model_load_name != "":
        model.load(dconf.model_load_name, strict=dconf.model_load_strict)
    else:
        zwarn("No model to load, Debugging mode??")
    # =====
    # augment with extra embeddings
    extra_embed_files = dconf.test_extra_pretrain_wv_files
    model_emb = model.get_emb()
    if model_emb is not None:
        _embedder = model_emb.eg.get_embedder("word")
        if len(extra_embed_files) > 0 and _embedder is not None:  # has extra_emb and need_emb
            # get embeddings
            extra_embedding = WordVectors.load(extra_embed_files[0])
            extra_embedding.merge_others([
                WordVectors.load(one_file)
                for one_file in extra_embed_files[1:]
            ])
            # get extra dictionary (only those words hit in extra-embed)
            extra_vocab = SimpleVocab.build_by_static(get_extra_hit_words(
                test_streamer, extra_embedding, vpack.get_voc("word")),
                                                      pre_list=None,
                                                      post_list=None)
            # give them to the model
            new_vocab = aug_words_and_embs(_embedder,
                                           vpack.get_voc("word"),
                                           extra_vocab,
                                           extra_embedding,
                                           aug_scale=dconf.pretrain_scale)
            vpack.put_voc("word", new_vocab)
    # =====
    # No Cache!!
    test_inst_preparer = model.get_inst_preper(False)
    test_iter, _ = batch_stream(
        index_stream(test_streamer, vpack, False, False, test_inst_preparer),
        tconf, False)
    return conf, model, vpack, test_iter
Example #21
 def annotate(self, insts: List):
     conf: AnnotatorP2DConf = self.conf
     # --
     # get all sentences and run in batch
     all_sents = list(yield_sents(insts))
     tmp_input = os.path.join(conf.p2d_tmp_dir, "_input.penn")
     with zopen(tmp_input, 'w') as fd:
         for sent in all_sents:
             fd.write(sent2tree(sent) + "\n")
     # run
     tmp_output = os.path.join(conf.p2d_tmp_dir, "_output.conllu")
     log_cmd = f'2>{conf.p2d_log}' if conf.p2d_log else ''
     system(f"{self.cmd} -treeFile {tmp_input} >{tmp_output} {log_cmd}")
     # read output and add back
     conll_reader_conf = ReaderGetterConf()
     conll_reader_conf.input_conf.use_multiline = True
     conll_reader_conf.input_conf.mtl_ignore_f = "'ignore_#'"
     conll_reader_conf.input_format = "conllu"
     conll_reader_conf.input_path = tmp_output
     conll_reader = get_reader(conll_reader_conf)
     new_sents = list(conll_reader)
     # --
     assert len(all_sents) == len(new_sents)
     for s0, s1 in zip(all_sents, new_sents):
         assert len(s0) == len(s1)
         mismatched_tokens = [
             (v1, v2) for v1, v2 in zip(s0.seq_word.vals, s1.seq_word.vals)
             if v1 != v2
         ]
         if len(mismatched_tokens) > 0:
             zwarn(
                 f"Mismatch token NUM={len(mismatched_tokens)}: {mismatched_tokens}"
             )
             if conf.p2d_change_words:
                 s0.build_words(s1.seq_word.vals)  # use the other one!!
             # breakpoint()
         # note: build again!
         s0.build_dep_tree(s1.tree_dep.seq_head.vals, [
             self.p2d_udp_converter(z) for z in s1.tree_dep.seq_label.vals
         ])
         if conf.p2d_use_xpos:
              trg_pos_list = s1.info.get("xpos")
         else:
             trg_pos_list = s1.seq_upos.vals
         s0.build_uposes([self.p2d_upos_converter(z) for z in trg_pos_list])
Example #22
 def __init__(self, conf: EmbeddingConf, npvec=None, **kwargs):
     super().__init__(conf, **kwargs)
     conf: EmbeddingConf = self.conf
     # --
     n_words, n_dim, freeze = conf.n_words, conf.osize, conf.freeze
     # --
     self.has_npvec_init = False
     if npvec is None:  # no init
         self.E = BK.new_param([n_words, n_dim])
         self.reset_parameters()
     else:
         self.has_npvec_init = True
         assert conf.n_words == len(npvec)
         self.reset_with_npvec(npvec)
     if freeze:
         self.rop.add_fixed_value("trainable", False)
         if npvec is None:
             zwarn("Meaningless to freeze random embeddings?")
Example #23
def check_and_fix_tree(s: str):
    cur_depth = 0
    hit_zero = 0
    for c in s:
        if c == "(":
            cur_depth += 1
        elif c == ")":
            cur_depth -= 1
        else:
            continue  # ignore others!
        assert cur_depth >= 0
        if cur_depth == 0:
            hit_zero += 1
    assert cur_depth == 0
    if hit_zero != 1:
        zwarn(f"Strange tree pieces={hit_zero}: {s}")
        return f"(S {s} )"  # simple fix
    else:
        return s
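A small usage sketch for check_and_fix_tree (hypothetical tree strings, assuming the function above is importable): a single balanced tree passes through unchanged, while multiple top-level pieces trigger the warning and get wrapped in an extra (S ...):
check_and_fix_tree("(S (NP (NNP John)) (VP (VBZ runs)))")  # single root: returned unchanged
check_and_fix_tree("(NP (NNP John)) (VP (VBZ runs))")      # two top-level pieces: warns, returns "(S (NP (NNP John)) (VP (VBZ runs)) )"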
Example #24
 def build_constraint_arrs(m: Dict[str, Union[List[str], Dict]],
                           voc_trg,
                           voc_src=None,
                           warning=True):
     # first build targets
     trg_len = len(voc_trg)
     arr_m = {}
     cc = Counter()
     for s, ts in m.items():
         trg_arr = np.zeros(trg_len, dtype=np.float32)
         hit_t = 0
         for t in ts:  # ts can be either List[str] or Dict[str,??]
             t_idx = voc_trg.get(t)
             if t_idx is not None:
                 trg_arr[t_idx] = 1.
                 hit_t += 1
             else:
                 cc["miss_t"] += 1  # miss one t
         if hit_t == 0:
             if warning:
                 zwarn(f"No trgs for src: {s}({ts})")
             cc["miss_ts"] += 1  # miss full ts
         arr_m[s] = trg_arr
     # then for src if providing voc
     if voc_src is None:
         zlog(
             f"Build constraint_arrs with trg: {len(arr_m)} x {trg_len}; {cc}"
         )
         return arr_m
     else:
         arr_m2 = np.zeros([len(voc_src), trg_len], dtype=np.float32)
         hit_s = 0
         for s, arr in arr_m.items():
             s_idx = voc_src.get(s)
             if s_idx is not None:
                 arr_m2[s_idx] = arr
                 hit_s += 1
             else:
                 cc["miss_s"] += 1
         zlog(
             f"Build constraint_arrs with src/trg: {arr_m2.shape}; hit={hit_s}/{len(arr_m)}={hit_s/len(arr_m):.4f}; {cc}"
         )
         return arr_m2
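A hypothetical usage sketch for build_constraint_arrs; plain dicts stand in for the vocab objects since only len() and .get() are used, and the call assumes the helper is reachable as a plain function or staticmethod:
voc_trg = {"ARG0": 0, "ARG1": 1, "ARG2": 2}  # toy target vocab: label -> index
constraints = {"give.01": ["ARG0", "ARG1", "ARG2"], "run.02": ["ARG0"]}
arr_m = build_constraint_arrs(constraints, voc_trg)  # str -> float32 mask over the 3 target labels
# arr_m["run.02"] -> array([1., 0., 0.], dtype=float32)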
Example #25
 def _load_txt(fname: str, sep=" "):
     zlog(f"Going to load pre-trained (txt) w2v from {fname} ...")
     repeated_count = 0
     words, vecs = [], []
     word_set = set()
     num_words, embed_size = None, None
     with zopen(fname) as fd:
         # first line
         line = fd.readline()
         try:
             num_words, embed_size = [int(x) for x in line.split(sep)]
             zlog(
                 f"Reading w2v num_words={num_words}, embed_size={embed_size}."
             )
             line = fd.readline()
         except ValueError:  # first line is not a "num_words embed_size" header
             zlog("Reading w2v.")
         # the rest
         while len(line) > 0:
             fields = line.rstrip().split(sep)
             word, vec = fields[0], [float(x) for x in fields[1:]]
             if word in word_set:
                 repeated_count += 1
                 zwarn(f"Repeat key {word}")
             else:  # only add the first one
                 words.append(word)
                 vecs.append(vec)
                 word_set.add(word)
             # put embed_size
             if embed_size is None:
                 embed_size = len(vec)
             else:
                 assert len(vec) == embed_size, "Unmatched embed dimension."
             line = fd.readline()
     if num_words is not None:
         assert num_words == len(vecs) + repeated_count
     num_words = len(vecs)
     # final
     zlog(
         f"Read ok: w2v num_words={num_words}, embed_size={embed_size}, repeat={repeated_count}"
     )
     return WordVectors(words, vecs)
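A usage sketch showing the plain-text format _load_txt expects: an optional "num_words embed_size" header, then one "word v1 v2 ..." row per entry. The file name is made up, and the call assumes the helper is reachable, e.g. as a staticmethod on WordVectors:
# Write a hypothetical tiny w2v file, then load it back.
with open("tiny.vec", "w") as f:
    f.write("2 3\n")              # optional header: num_words embed_size
    f.write("cat 0.1 0.2 0.3\n")
    f.write("dog 0.4 0.5 0.6\n")
wv = WordVectors._load_txt("tiny.vec")  # assumption: exposed as a staticmethod of WordVectors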
Example #26
 def load_progress(self, file: str, forward_stream=False):
     old_uidx = self.tp.uidx
     d = default_json_serializer.from_file(file)
     self.tp.from_json(d)
     if forward_stream:
         if old_uidx > self.tp.uidx:
             zwarn(
                 f"Cannot go to the past: {old_uidx} -> {self.tp.uidx}, skip this!"
             )
         else:
             _s = self.train_stream
             for _ in range(self.tp.uidx - old_uidx):
                 _, _eos = _s.next_and_check()
                 if _eos:  # restart and get one
                     _s.restart()
                     _s.next()
             zlog(f"Forward to the future: {old_uidx} -> {self.tp.uidx}!",
                  func="io")
     zlog(f"Load training progress from {file}", func="io")
     self.adjust_scheduled_values()  # also adjust values!
Example #27
 def get_label_mask(self, sels: List[str]):
     expand_sels = []
     for s in sels:
         if s in UD_CATEGORIES:
             expand_sels.extend(UD_CATEGORIES[s])
         else:
             expand_sels.append(s)
     expand_sels = sorted(set(expand_sels))
     voc = self.voc
     # --
     ret = np.zeros(len(voc))
     _cc = 0
     for s in expand_sels:
         if s in voc:
             ret[voc[s]] = 1.
             _cc += voc.word2count(s)
         else:
             zwarn(f"UNK dep label: {s}")
     _all_cc = voc.get_all_counts()
     zlog(f"Get label mask with {expand_sels}: {len(expand_sels)}=={ret.sum().item()} -> {_cc}/{_all_cc}={_cc/(_all_cc+1e-5)}")
     return BK.input_real(ret)
Example #28
 def _prep_items(self, items: List, par: object, seq_len: int):
     vocab: SeqVocab = self.vocab
     # --
     core_spans = [
         self.core_span_getter(f.mention) + (f.label_idx, ) for f in items
     ]
     _loss_weight_non = getattr(
         par, "_loss_weight_non",
         1.)  # todo(+N): special name; loss_weight_non
     tag_layers = vocab.spans2tags_idx(core_spans, seq_len)
     if len(tag_layers) > 1:
         zwarn(
             f"Warning: '{self.conf.ftag}' only use layer0 but the full needs multiple layers with {core_spans}"
         )
         # breakpoint()
     trg_tags = tag_layers[0][0]
     # trg_first_items = [(items[i] if i>=0 else None) for i in tag_layers[0][1]]  # note: put it at the start!
     # return ZObject(loss_weight_non=_loss_weight_non, first_items=trg_first_items, tags=trg_tags, len=len(trg_tags))
     return ZObject(loss_weight_non=_loss_weight_non,
                    tags=trg_tags,
                    len=len(trg_tags))
Example #29
 def __init__(self, conf: TransformerConf, **kwargs):
     super().__init__(conf, **kwargs)
     # --
     conf: TransformerConf = self.conf
     self.tnodes = []
     for i in range(conf.n_layers):
         one_node = _OneTSFNode(conf)
         self.add_module(f"T{i}", one_node)
         self.tnodes.append(one_node)
     # add posi embeddings
     # (note: nope!!) self.scale_when_add_posi = math.sqrt(conf.d_model)
     if conf.use_posi:
         self.PE = PosiEmbeddingNode(conf.pconf, osize=conf.d_model)
     # input f
     if conf.wconf.strategy == "addnorm":
         self.input_f = LayerNormNode(None, osize=conf.d_model)
     elif conf.wconf.strategy == "addact":
         self.input_f = ActivationHelper.get_act(conf.wconf.act)
     else:
         zwarn("No calculations for input in TransformerEncoder!!")
         self.input_f = lambda x: x
Example #30
 def __init__(self, conf: TransformerDecConf, **kwargs):
     super().__init__(conf, **kwargs)
     # --
     conf: TransformerDecConf = self.conf
     assert conf.step_dim == 1, "Transformer assumes [bsize, len, D]!!"
     self.tnodes = []
     for i in range(conf.n_layers):
         one_node = _OneTSFDecNode(conf)
         self.add_module(f"T{i}", one_node)
         self.tnodes.append(one_node)
     # add posi embeddings
     if conf.use_posi:
         self.PE = PosiEmbeddingNode(conf.pconf, osize=conf.d_model)
     # input f
     if conf.wconf.strategy == "addnorm":
         self.input_f = LayerNormNode(None, osize=conf.d_model)
     elif conf.wconf.strategy == "addact":
         self.input_f = ActivationHelper.get_act(conf.wconf.act)
     else:
         zwarn("No calculations for input in TransformerEncoder!!")
         self.input_f = lambda x: x