def __init__(self, results: Dict = None, description=None, score: float = None):
    self.results = results if results is not None else {}
    self.description = description
    if score is not None:
        if ResultRecord.RES_KEY in self.results:  # note: check self.results (input may be None)
            zwarn(f"RES_KEY already exists, rewrite it: {self.results[ResultRecord.RES_KEY]} -> {score}")
        self.results[ResultRecord.RES_KEY] = score
def from_pretrained(conf: ZEncoderBertConf):
    bert_name, cache_dir = conf.bert_model, conf.cache_dir_or_none
    zlog(f"Loading pre-trained bert model for ZBert of {bert_name} from {cache_dir}")
    # --
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(bert_name, cache_dir=cache_dir)
    sub_toker = ZBertSubwordTokenizer(bert_name, tokenizer)
    mtype = {"bert": BertModel, "roberta": RobertaModel, "xlm": XLMRobertaModel}[bert_name.split("/")[-1].split("-")[0]]
    if conf.bert_no_pretrain:
        from transformers import AutoConfig
        bert_config = AutoConfig.from_pretrained(bert_name)
        model = mtype(bert_config)
        zwarn("No pretrain-loading for bert, really want this?")
    else:
        model = mtype.from_pretrained(bert_name, cache_dir=cache_dir)
    # --
    if hasattr(model, "pooler"):  # note: delete unused part!
        model.__delattr__("pooler")
    # --
    model.eval()  # note: by default set eval!!
    # --
    zlog(f"Load ok, move to default device {BK.DEFAULT_DEVICE}")
    model.to(BK.DEFAULT_DEVICE)
    zlog("Move ok!")
    return tokenizer, sub_toker, model
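
# A minimal standalone sketch of the same loading flow using only HuggingFace
# transformers (no msp2 conf/BK involved); "bert-base-cased" is just a
# placeholder model name, not something fixed by the code above.
def load_bert_sketch(bert_name: str = "bert-base-cased", no_pretrain: bool = False):
    from transformers import AutoConfig, AutoTokenizer, BertModel
    tokenizer = AutoTokenizer.from_pretrained(bert_name)
    if no_pretrain:  # same architecture, randomly initialized
        model = BertModel(AutoConfig.from_pretrained(bert_name))
    else:
        model = BertModel.from_pretrained(bert_name)
    model.eval()  # inference mode by default, as above
    return tokenizer, model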
def forward(self, med: ZMediator):
    ibatch_seq_info = med.ibatch.seq_info
    # prepare input, truncate if too long
    _input_ids, _input_masks, _input_segids = \
        ibatch_seq_info.enc_input_ids, ibatch_seq_info.enc_input_masks, ibatch_seq_info.enc_input_segids
    _eff_input_ids = med.get_cache('eff_input_ids')  # note: special name!!
    if _eff_input_ids is not None:
        _input_ids = _eff_input_ids
    # --
    if BK.get_shape(_input_ids, -1) > self.tokenizer.model_max_length:
        _full_len = BK.get_shape(_input_ids, -1)
        _max_len = self.tokenizer.model_max_length
        zwarn(f"Input too long for bert, truncate it: {BK.get_shape(_input_ids)} => {_max_len}")
        _input_ids, _input_masks, _input_segids = \
            _input_ids[:, :_max_len], _input_masks[:, :_max_len], _input_segids[:, :_max_len]
        # todo(+W+N): how to handle decoders for these cases?
    # forward
    ret = self.bert.forward(_input_ids, _input_masks, _input_segids, med=med)
    # extra
    if self.gcn:
        ret = self.gcn.forward(med)
    # --
    return ret
def reset_parameters(self):  # random reset!
    conf: EmbeddingConf = self.conf
    BK.init_param(self.E, "glorot", lookup=True, scale=conf.init_scale)
    if self.has_npvec_init:
        zwarn("Reset Embedding to random, maybe need to reassign with pre-trained ones?!")
def _prep_sent(self, sent: Sent):
    conf: MySRLConf = self.conf
    slen = len(sent)
    _loss_weight_non = getattr(sent, "_loss_weight_non", 1.)  # todo(+N): special name; loss_weight_non
    # note: for simplicity, assume no loss_weight_non for args
    # first for events
    evt_arr = np.full([slen], 0, dtype=np.int64)  # [evt]
    arg_arr = np.full([slen, slen], 0, dtype=np.int64)  # [evt, arg]
    evt_items = np.full([slen], None, dtype=object)  # [evt]
    for f in sent.get_frames(conf.evt_ftag):  # note: assume no overlapping
        # predicate
        evt_widx, evt_wlen = self.evt_span_getter(f.mention)
        evt_label = f.label_idx
        assert evt_wlen == 1 and evt_label > 0, "For simplicity!!"
        evt_items[evt_widx] = f
        evt_arr[evt_widx] = evt_label
        # arguments
        if conf.arg_only_rank1:
            cur_args = [a for a in f.args if a.info.get("rank", 1) == 1]
        else:
            cur_args = f.args
        # bio or not
        if conf.arg_use_bio:  # special
            arg_spans = [self.arg_span_getter(a.mention) + (a.label_idx,) for a in cur_args]
            tag_layers = self.vocab_arg.spans2tags_idx(arg_spans, slen)
            if len(tag_layers) > 1:
                zwarn(f"Warning: full args require multiple layers with {arg_spans}")
            arg_arr[evt_widx, :] = tag_layers[0][0]  # directly assign it!
        else:  # plain ones
            for a in cur_args:
                arg_role = a.label_idx
                arg_widx, arg_wlen = self.arg_span_getter(a.mention)
                arg_arr[evt_widx, arg_widx:arg_widx+arg_wlen] = arg_role
    return ZObject(sent=sent, slen=slen, loss_weight_non=_loss_weight_non,
                   evt_items=evt_items, evt_arr=evt_arr, arg_arr=arg_arr)
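
# A self-contained sketch of the spans->BIO conversion that spans2tags_idx is
# assumed to perform for a single layer (function and names here are
# illustrative only; the real vocab method also returns index layers).
def spans_to_bio(spans, slen):
    """spans: list of (widx, wlen, label); returns a length-slen list of BIO tags."""
    tags = ["O"] * slen
    for widx, wlen, label in spans:
        if any(t != "O" for t in tags[widx:widx+wlen]):
            continue  # skip overlaps; the real code puts them on extra layers
        tags[widx] = f"B-{label}"
        for i in range(widx+1, widx+wlen):
            tags[i] = f"I-{label}"
    return tags

# e.g. spans_to_bio([(1, 2, "ARG0"), (4, 1, "ARG1")], 6)
# -> ['O', 'B-ARG0', 'I-ARG0', 'O', 'B-ARG1', 'O']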
def __init__(self, conf: AttentionPlainConf, **kwargs):
    super().__init__(conf, **kwargs)
    conf: AttentionPlainConf = self.conf
    dim_q, dim_k, dim_v, nh_qk, d_qk, nh_v, d_v = \
        conf.dim_q, conf.dim_k, conf.dim_v, conf.nh_qk, conf.d_qk, conf.nh_v, conf.d_v
    # --
    self._att_scale = math.sqrt(conf.d_qk)  # scale for score
    # pre-att affines (no dropouts here!)
    _eg_q = BK.get_inita_xavier_uniform((d_qk, dim_q)) / BK.get_inita_xavier_uniform((nh_qk*d_qk, dim_q))
    self.affine_q = AffineNode(None, isize=dim_q, osize=nh_qk*d_qk, no_drop=True, init_scale=_eg_q*conf.init_scale_hin)
    _eg_k = BK.get_inita_xavier_uniform((d_qk, dim_k)) / BK.get_inita_xavier_uniform((nh_qk*d_qk, dim_k))
    self.affine_k = AffineNode(None, isize=dim_k, osize=nh_qk*d_qk, no_drop=True, init_scale=_eg_k*conf.init_scale_hin)
    self.affine_v = AffineNode(None, isize=dim_v, osize=nh_v*d_v, no_drop=True)
    # rel dist keys
    self.rposi = RelDistNode(conf.rel, _dim=d_qk) if conf.use_rposi else None
    # att & output
    if conf.useaff_qk2v:
        self.aff_qk2v = AffineNode(None, isize=nh_qk, osize=nh_v)
    else:
        # assert nh_qk == nh_v
        if nh_qk != nh_v:
            zwarn(f"Possible problems with AttNode since hin({nh_qk}) != hout({nh_v})")
    self.adrop = DropoutNode(None, drop_rate=conf.att_drop, fix_drop=False)
    # todo(note): with drops(y) & act(?) & bias(y)?
    self.final_linear = AffineNode(None, isize=nh_v*d_v, osize=dim_v, out_act=conf.out_act)
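
# A minimal numpy sketch of the scaled dot-product scoring that _att_scale
# feeds into (single head, no masking/dropout/rel-dist); this is the standard
# Transformer formulation, not the exact AttentionPlainNode forward.
import numpy as np

def scaled_dot_attention(q, k, v):
    """q: [lq, d], k/v: [lk, d] -> [lq, d]"""
    scores = q @ k.T / np.sqrt(q.shape[-1])            # [lq, lk], scaled by sqrt(d_qk)
    weights = np.exp(scores - scores.max(-1, keepdims=True))
    weights /= weights.sum(-1, keepdims=True)          # softmax over keys
    return weights @ v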
def collect_tokens(self, char_idx: int, char_len: int):
    # collect all tokens
    index_chars = self.full_char_idxes
    tokens = []
    for ii in range(char_idx, char_idx + char_len):
        vv = index_chars[ii]
        if vv is not None:
            if len(tokens) == 0 or vv != tokens[-1]:  # find a new one
                # assert continuing span
                assert len(tokens) == 0 or (vv[0] == tokens[-1][0] and vv[1] == tokens[-1][1]+1) \
                    or (vv[0] == tokens[-1][0]+1 and vv[1] == 0)
                tokens.append(vv)
    # --
    # check
    str0 = ''.join(self.offset_str[char_idx:char_idx+char_len].split())
    str1 = ''.join([''.join(self.sent_tokens[sid][wid].split()) for sid, wid in tokens])
    if str0 not in str1:
        # note: a very strange 'ar' case ...
        if str1 == ''.join(str0.split("_")) or set(str0).difference(set(str1)) == set(chr(1618)):
            zwarn(f"=> Slightly unmatch: {str0} vs {str1}")
        else:
            raise RuntimeError()
    return tokens
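
# A self-contained sketch of the char->token index that full_char_idxes is
# assumed to hold: position i maps to a (sent_id, word_id) pair, or None for
# separators between tokens (an illustrative reconstruction, not the real builder).
def build_char_index(sent_tokens):
    """sent_tokens: list of sentences, each a list of token strings."""
    chars, index = [], []
    for sid, toks in enumerate(sent_tokens):
        for wid, tok in enumerate(toks):
            if chars:  # one separating space
                chars.append(' ')
                index.append(None)
            chars.extend(tok)
            index.extend([(sid, wid)] * len(tok))
    return ''.join(chars), index

# e.g. build_char_index([["a", "bc"]]) -> ("a bc", [(0,0), None, (0,1), (0,1)])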
def pieces2tree(words: List[str], xposes: List[str], parses: List[str]):
    rets = []
    assert len(words) == len(xposes) and len(words) == len(parses)
    for w, xp, pp in zip(words, xposes, parses):
        try:
            p0, p1 = pp.split("*")  # must be two pieces
        except:
            # note: this can be caused by empty [word]!
            zwarn(f"Bad parse-bit: {pp}, assume that is '*'")
            p0, p1 = '', ''
        if xp in ["*", "-"]:
            xp = "XX"  # also fix pos
        new_w = []
        for c in w:
            # note: for simplicity, map all bracket-like chars to "<"/">" to avoid -LCB-, ...
            new_w.append({'(': "-LRB-", ')': "-RRB-", '<': "<", '>': ">",
                          '[': "<", ']': ">", '{': "<", '}': ">",
                          '〈': "<", '〉': ">"}.get(c, c))
        if xp == '(':
            xp = "-LRB-"
        elif xp == ')':
            xp = "-RRB-"
        rets.append(f"{p0} ({xp} {''.join(new_w)}) {p1}")
    tree_ret = " ".join(rets)
    tree_fix = check_and_fix_tree(tree_ret)
    return tree_fix
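
# Usage: the per-token parse bits follow the OntoNotes/CoNLL convention where
# "*" marks the word's slot inside its bracketing, e.g.:
# pieces2tree(["John", "runs"], ["NNP", "VBZ"], ["(S(NP*)", "(VP*))"])
# -> "(S(NP (NNP John) ) (VP (VBZ runs) ))"  (modulo spacing)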
def __init__(self, conf: PlainInputEmbedderConf, voc: SimpleVocab, npvec: np.ndarray = None, name="UNK"):
    super().__init__(conf, name)
    # --
    conf: PlainInputEmbedderConf = self.conf
    self.voc = voc
    # check init embeddings
    if conf.init_from_pretrain:
        zlog(f"Try to init {self.extra_repr()} with npvec.shape={npvec.shape if (npvec is not None) else None}")
        if npvec is None:
            zwarn("warning: cannot get pre-trained embeddings to init!!")
    # get rare unk range
    voc_rare_unk_mask = []
    for w in self.voc.full_i2w:
        c = self.voc.word2count(w, df=None)
        voc_rare_unk_mask.append(float(c is not None and c <= conf.rare_unk_thr))
    self.rare_unk_mask = BK.input_real(voc_rare_unk_mask)  # stored tensor!
    # self.register_buffer()  # todo(note): do we need register buffer?
    self.use_rare_unk = (conf.rare_unk_rate > 0. and conf.rare_unk_thr > 0)
    # add the real embedding node
    self.E = EmbeddingNode(conf.econf, npvec=npvec, osize=conf.dim, n_words=len(self.voc))
def main(args):
    # conf
    conf: ZOverallConf = init_everything(ZOverallConf(), args)
    # task
    t_center = TaskCenter(conf.tconf)
    # data
    d_center = DataCenter(conf.dconf, specified_wset=["test"])
    # load vocab
    t_center.load_vocabs(t_center.conf.vocab_load_dir)
    # prepare datasets
    t_center.prepare_datasets(d_center.get_datasets())
    # build model
    model = ZModel(conf.mconf)
    t_center.build_mods(model)
    model.finish_sr()  # note: build sr before possible loading in testing!!
    # run
    r_center = RunCenter(conf.rconf, model, t_center, d_center)
    if conf.rconf.model_load_name != "":
        r_center.load(conf.rconf.model_load_name)
    else:
        zwarn("No model to load, Debugging mode??")
    res = r_center.do_test()
    zlog(f"zzzztestfinal: {res}")
    # --
    zlog("The end of Testing.")
def query(self, sent):
    conf = self.conf
    # --
    key = tuple(self._get_words(sent))
    if (not conf.no_exact_match) and key in self.exact_map:
        return self.exact_map[key]
    else:
        # allow fuzzy match with only some char differences!
        good_ones = []  # (sent, (err_word, err_char))
        for cand in self.length_maps.get(len(key), []):
            # --
            if id(cand) in self.fuzzy_hit_ids:
                continue
            # --
            cand_key = self._get_words(cand)
            assert len(cand_key) == len(key)
            _budget_words = min(conf.fuzzy_seq_wnum, int(conf.fuzzy_seq_wrate * len(key)))
            is_good = True
            cand_err_word, cand_err_char = 0, 0
            for w1, w2 in zip(cand_key, key):
                # simple filter
                if len(w1) - len(w2) > conf.fuzzy_word_cnum:
                    is_good = False
                    break
                # digits filter
                if (all(c in self.special_set for c in w1) or all(c in self.special_set for c in w2)) and (w1 != w2):
                    is_good = False
                    break
                # --
                # special distance: only allow same_len diff, prefix, suffix
                _err = self._get_edit_distance(w1, w2)
                # --
                if _err > 0:
                    cand_err_word += 1
                    cand_err_char += _err
                    if _err <= conf.fuzzy_word_cnum:
                        _budget_words -= 1
                        if _budget_words < 0:  # differ too much overall
                            is_good = False
                            break
                    else:  # differ too much in one word
                        is_good = False
                        break
            is_good = (is_good and (_budget_words >= 0))
            if is_good:
                good_ones.append((cand, (cand_err_word, cand_err_char)))
        # return the least err one!
        if len(good_ones) >= 2:
            zwarn(f"Get multiple options for {key}")
        # return None if len(good_ones)<=0 else min(good_ones, key=lambda x: x[-1])[0]
        # note: only return if there is "the only one"!!
        ret = None if len(good_ones) != 1 else good_ones[0][0]
        if conf.fuzzy_no_repeat_query and ret is not None:
            self.fuzzy_hit_ids.add(id(ret))
        return ret
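
# A self-contained sketch of the restricted "edit distance" described in the
# comment above: only same-length substitutions, or a pure prefix/suffix
# length difference, are counted (an illustrative reconstruction of
# _get_edit_distance's contract, not the real implementation).
def restricted_edit_distance(w1: str, w2: str) -> int:
    if w1 == w2:
        return 0
    if len(w1) == len(w2):  # same length: count substituted chars
        return sum(c1 != c2 for c1, c2 in zip(w1, w2))
    short, long_ = sorted((w1, w2), key=len)
    if long_.startswith(short) or long_.endswith(short):  # prefix/suffix diff
        return len(long_) - len(short)
    return max(len(w1), len(w2))  # anything else counts as maximally different

# restricted_edit_distance("cat", "cut") -> 1; restricted_edit_distance("cat", "cats") -> 1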
def __init__(self, conf: PosiEmbeddingConf, **kwargs):
    super().__init__(conf, **kwargs)
    conf: PosiEmbeddingConf = self.conf
    # --
    self.E = BK.new_param([conf.max_val - conf.min_val + 1, conf.osize])
    self.reset_parameters()
    if conf.freeze:
        self.rop.add_fixed_value("trainable", False)
        if not conf.init_sincos:
            zwarn("Meaningless to freeze random posi-embeddings?")
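
# A minimal numpy sketch of the sinusoidal table that init_sincos presumably
# builds (the standard Transformer formulation; the real init may differ in
# details). Assumes an even dim.
import numpy as np

def sincos_table(n_posi: int, dim: int) -> np.ndarray:
    posi = np.arange(n_posi)[:, None]                    # [n_posi, 1]
    div = np.power(10000.0, np.arange(0, dim, 2) / dim)  # [dim//2]
    table = np.zeros((n_posi, dim))
    table[:, 0::2] = np.sin(posi / div)  # even dims: sin
    table[:, 1::2] = np.cos(posi / div)  # odd dims: cos
    return table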
def put(self, sent):
    assert self.simple_searcher is None, "Cannot grow it dynamically!"
    _key = self._get_key(sent)
    # --
    _cands = self.key_cands[_key]
    if not any(s.seq_word.vals == sent.seq_word.vals for s in _cands):  # no exact matching!
        self.key_cands[_key].append(sent)
        self.all_items.append(sent)
        if len(self.key_cands[_key]) > 1:
            zwarn(f"Sents with same keys: {[z.seq_word for z in self.key_cands[_key]]}")
def _read_coreness_from_nltk(which_fn="fn15"):
    which_fn = {"fn15": 15, '': 15, 'fn17': 17}.get(which_fn, None)
    if which_fn is None:
        zwarn("Cannot read coreness, simply let it be EMPTY!!")
        return {}
    if which_fn == 15:
        from nltk.corpus import framenet15 as nltk_fn
    else:
        from nltk.corpus import framenet as nltk_fn
    # --
    cmap = {}  # FrameName -> {RoleName -> CoreType}
    for frame in nltk_fn.frames():
        cmap[frame.name] = {k: v.coreType for k, v in frame.FE.items()}
    return cmap
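
# Usage sketch; requires the nltk framenet data, e.g. nltk.download('framenet_v17').
# The returned map looks roughly like the following (frame/role entries here are
# illustrative; exact contents depend on the installed FrameNet version):
# cmap = _read_coreness_from_nltk("fn17")
# cmap["Arriving"] -> {"Theme": "Core", "Goal": "Core", "Time": "Peripheral", ...}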
def load(self, prefix="./"):
    for name in self.vocabs:
        fname = prefix + "vv_" + name + ".txt"
        if os.path.exists(fname):
            vtype = self.voc_types.get(name, self._default_vocab_type)
            self.vocabs[name] = vtype.read_from_file(fname)
        else:
            zwarn("Cannot find Vocab " + name)
            self.vocabs[name] = None
    for name in self.embeds:
        fname = prefix + "ve_" + name + ".pic"
        if os.path.exists(fname):
            self.embeds[name] = default_pickle_serializer.from_file(fname)
        else:
            self.embeds[name] = None
def check_params(self, ps):
    remaining_keys = set(self.all_params.keys())
    extra_ones = []
    to_check = list(ps)
    for p in to_check:
        p_id = id(p)
        if p_id in remaining_keys:
            remaining_keys.remove(p_id)
        else:
            extra_ones.append((get_shape(p),))
    missing_ones = [(k, self.all_params[k]) for k in remaining_keys]
    if len(extra_ones) > 0:
        zwarn("Check-params extra:" + '\n'.join([str(x) for x in extra_ones]))
    if len(missing_ones) > 0:
        zwarn("Check-params missing:" + '\n'.join([str(x) for x in missing_ones]))
def _read_coreness_from_file(file: str):
    frame_map = default_json_serializer.from_file(file)
    cmap = {}  # FrameName -> {RoleName -> CoreType}
    for f, v in frame_map.items():
        assert f not in cmap, f"Err: repeated frame {f}"
        new_map = {}
        for fe in v["FE"]:
            role, core_type = fe["name"], fe["coreType"]
            # assert role not in new_map, f"Err: repeated frame-role {f}:{role}"
            if role in new_map:  # skip this one!
                zwarn(f"repeated frame-role {f}:{role}")
            else:
                new_map[role] = core_type
        cmap[f] = new_map
    return cmap
def _parse_mention(self, mention: Dict, doc: Doc) -> Mention:
    # get mention
    main_posi_info = mention.get("posi")
    if main_posi_info is None:
        return None  # no posi info!!
    sid, widx, wlen = self._read_posi(main_posi_info)
    ret = Mention.create(doc.sents[sid], widx, wlen)
    # possible head span?
    head_posi_info = mention.get("head", {}).get("posi")
    if head_posi_info is not None:
        head_sid, head_widx, head_wlen = self._read_posi(head_posi_info)
        if head_sid != sid or not (head_widx >= widx and head_widx+head_wlen <= widx+wlen):
            zwarn(f"Error in head: {head_posi_info} vs. {main_posi_info}")
        else:  # make sure things are correct! otherwise simply discard!!
            ret.set_span(head_widx, head_wlen, hspan=True)
    return ret
def main(args):
    # conf
    conf: ZOverallConf = init_everything(ZOverallConf(), args)
    # task
    t_center = TaskCenter(conf.tconf)
    # data
    d_center = DataCenter(conf.dconf, specified_wset=[])  # nothing to load here!
    # load vocab
    t_center.load_vocabs(t_center.conf.vocab_load_dir)
    # prepare datasets
    t_center.prepare_datasets(d_center.get_datasets())
    # build model
    model = ZModel(conf.mconf)
    t_center.build_mods(model)
    model.finish_sr()  # note: build sr before possible loading in testing!!
    # run
    r_center = RunCenter(conf.rconf, model, t_center, d_center)
    if conf.rconf.model_load_name != "":
        r_center.load(conf.rconf.model_load_name)
    else:
        zwarn("No model to load, Debugging mode??")
    # =====
    cc = Counter()
    BATCH_LINE = int(os.environ.get('ZMSP_BATCH_LINE', 1000))  # 1000 sents at a time; note: env values are strings!
    test_dataset = ZDataset(d_center.conf.testM, 'testM', 'decode', _no_load=True)  # use testM for other options!
    for lines in yield_lines(sys.stdin, BATCH_LINE):
        insts = [Sent.create(one.split()) for one in lines]  # note: simply split as sentence!!
        test_dataset.set_insts(insts)  # directly set it!
        cc["sent"] += len(insts)
        if cc["sent"] % 50000 == 0:
            zlog(f"Decode for {cc}")
        # --
        t_center.prepare_datasets([test_dataset])  # re-prepare!!
        for ibatch in test_dataset.yield_batches(loop=False):
            one_res = model.predict_on_batch(ibatch)
        # --
        for inst in insts:
            sys.stdout.write(json.dumps(inst.to_json(), ensure_ascii=False) + "\n")
    # =====
    zlog(f"The end of Decoding: {cc}")
def prepare_test(args):
    conf: OverallConf = init_everything(OverallConf(), args)
    dconf, tconf = conf.dconf, conf.tconf
    # vocab
    vpack = ZmtlVocabPackage.build_by_reading(dconf)
    # prepare data
    test_streamer = dconf.R.get_reader(input_path=dconf.test)
    # model
    model = build_model(conf, vpack=vpack)
    if dconf.model_load_name != "":
        model.load(dconf.model_load_name, strict=dconf.model_load_strict)
    else:
        zwarn("No model to load, Debugging mode??")
    # =====
    # augment with extra embeddings
    extra_embed_files = dconf.test_extra_pretrain_wv_files
    model_emb = model.get_emb()
    if model_emb is not None:
        _embedder = model_emb.eg.get_embedder("word")
        if len(extra_embed_files) > 0 and _embedder is not None:  # has extra_emb and need_emb
            # get embeddings
            extra_embedding = WordVectors.load(extra_embed_files[0])
            extra_embedding.merge_others([WordVectors.load(one_file) for one_file in extra_embed_files[1:]])
            # get extra dictionary (only those words hit in extra-embed)
            extra_vocab = SimpleVocab.build_by_static(
                get_extra_hit_words(test_streamer, extra_embedding, vpack.get_voc("word")),
                pre_list=None, post_list=None)
            # give them to the model
            new_vocab = aug_words_and_embs(_embedder, vpack.get_voc("word"),
                                           extra_vocab, extra_embedding, aug_scale=dconf.pretrain_scale)
            vpack.put_voc("word", new_vocab)
    # =====
    # No Cache!!
    test_inst_preparer = model.get_inst_preper(False)
    test_iter, _ = batch_stream(index_stream(test_streamer, vpack, False, False, test_inst_preparer), tconf, False)
    return conf, model, vpack, test_iter
def annotate(self, insts: List):
    conf: AnnotatorP2DConf = self.conf
    # --
    # get all sentences and run in batch
    all_sents = list(yield_sents(insts))
    tmp_input = os.path.join(conf.p2d_tmp_dir, "_input.penn")
    with zopen(tmp_input, 'w') as fd:
        for sent in all_sents:
            fd.write(sent2tree(sent) + "\n")
    # run
    tmp_output = os.path.join(conf.p2d_tmp_dir, "_output.conllu")
    log_cmd = f'2>{conf.p2d_log}' if conf.p2d_log else ''
    system(f"{self.cmd} -treeFile {tmp_input} >{tmp_output} {log_cmd}")
    # read output and add back
    conll_reader_conf = ReaderGetterConf()
    conll_reader_conf.input_conf.use_multiline = True
    conll_reader_conf.input_conf.mtl_ignore_f = "'ignore_#'"
    conll_reader_conf.input_format = "conllu"
    conll_reader_conf.input_path = tmp_output
    conll_reader = get_reader(conll_reader_conf)
    new_sents = list(conll_reader)
    # --
    assert len(all_sents) == len(new_sents)
    for s0, s1 in zip(all_sents, new_sents):
        assert len(s0) == len(s1)
        mismatched_tokens = [(v1, v2) for v1, v2 in zip(s0.seq_word.vals, s1.seq_word.vals) if v1 != v2]
        if len(mismatched_tokens) > 0:
            zwarn(f"Mismatch token NUM={len(mismatched_tokens)}: {mismatched_tokens}")
            if conf.p2d_change_words:
                s0.build_words(s1.seq_word.vals)  # use the other one!!
            # breakpoint()
        # note: build again!
        s0.build_dep_tree(s1.tree_dep.seq_head.vals,
                          [self.p2d_udp_converter(z) for z in s1.tree_dep.seq_label.vals])
        if conf.p2d_use_xpos:
            trg_pos_list = s1.info.get("xpos")
        else:
            trg_pos_list = s1.seq_upos.vals
        s0.build_uposes([self.p2d_upos_converter(z) for z in trg_pos_list])
def __init__(self, conf: EmbeddingConf, npvec=None, **kwargs):
    super().__init__(conf, **kwargs)
    conf: EmbeddingConf = self.conf
    # --
    n_words, n_dim, freeze = conf.n_words, conf.osize, conf.freeze
    # --
    self.has_npvec_init = False
    if npvec is None:  # no init
        self.E = BK.new_param([n_words, n_dim])
        self.reset_parameters()
    else:
        self.has_npvec_init = True
        assert conf.n_words == len(npvec)
        self.reset_with_npvec(npvec)
    if freeze:
        self.rop.add_fixed_value("trainable", False)
        if npvec is None:
            zwarn("Meaningless to freeze random embeddings?")
def check_and_fix_tree(s: str):
    cur_depth = 0
    hit_zero = 0
    for c in s:
        if c == "(":
            cur_depth += 1
        elif c == ")":
            cur_depth -= 1
        else:
            continue  # ignore others!
        assert cur_depth >= 0
        if cur_depth == 0:
            hit_zero += 1
    assert cur_depth == 0
    if hit_zero != 1:
        zwarn(f"Strange tree pieces={hit_zero}: {s}")
        return f"(S {s} )"  # simple fix
    else:
        return s
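
# e.g. check_and_fix_tree("(NP (DT a) (NN dog))") returns the input unchanged
# (one balanced piece), while "(NP (DT a)) (VP (VBZ runs))" has two top-level
# pieces and comes back wrapped as "(S (NP (DT a)) (VP (VBZ runs)) )".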
def build_constraint_arrs(m: Dict[str, Union[List[str], Dict]], voc_trg, voc_src=None, warning=True):
    # first build targets
    trg_len = len(voc_trg)
    arr_m = {}
    cc = Counter()
    for s, ts in m.items():
        trg_arr = np.zeros(trg_len, dtype=np.float32)
        hit_t = 0
        for t in ts:  # ts can be either List[str] or Dict[str,??]
            t_idx = voc_trg.get(t)
            if t_idx is not None:
                trg_arr[t_idx] = 1.
                hit_t += 1
            else:
                cc["miss_t"] += 1  # miss one t
        if hit_t == 0:
            if warning:
                zwarn(f"No trgs for src: {s}({ts})")
            cc["miss_ts"] += 1  # miss full ts
        arr_m[s] = trg_arr
    # then for src if providing voc
    if voc_src is None:
        zlog(f"Build constraint_arrs with trg: {len(arr_m)} x {trg_len}; {cc}")
        return arr_m
    else:
        arr_m2 = np.zeros([len(voc_src), trg_len], dtype=np.float32)
        hit_s = 0
        for s, arr in arr_m.items():
            s_idx = voc_src.get(s)
            if s_idx is not None:
                arr_m2[s_idx] = arr
                hit_s += 1
            else:
                cc["miss_s"] += 1
        zlog(f"Build constraint_arrs with src/trg: {arr_m2.shape}; hit={hit_s}/{len(arr_m)}={hit_s/len(arr_m):.4f}; {cc}")
        return arr_m2
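
# A toy illustration of the resulting mask (plain dicts stand in for the vocab
# objects here, since the function only needs len() and .get(key) -> index-or-None):
# m = {"eat": ["ARG0", "ARG1"]}, voc_trg = {"ARG0": 0, "ARG1": 1, "ARGM": 2}
# -> arr_m["eat"] == np.array([1., 1., 0.], dtype=np.float32)
# With voc_src = {"eat": 0, "run": 1}, the src/trg matrix is [2, 3] with row 0
# set to that vector and row 1 all zeros.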
def _load_txt(fname: str, sep=" "):
    zlog(f"Going to load pre-trained (txt) w2v from {fname} ...")
    repeated_count = 0
    words, vecs = [], []
    word_set = set()
    num_words, embed_size = None, None
    with zopen(fname) as fd:
        # first line
        line = fd.readline()
        try:
            num_words, embed_size = [int(x) for x in line.split(sep)]
            zlog(f"Reading w2v num_words={num_words}, embed_size={embed_size}.")
            line = fd.readline()
        except:
            zlog("Reading w2v.")
        # the rest
        while len(line) > 0:
            fields = line.rstrip().split(sep)
            word, vec = fields[0], [float(x) for x in fields[1:]]
            if word in word_set:
                repeated_count += 1
                zwarn(f"Repeat key {word}")
            else:  # only add the first one
                words.append(word)
                vecs.append(vec)
                word_set.add(word)
            # put embed_size
            if embed_size is None:
                embed_size = len(vec)
            else:
                assert len(vec) == embed_size, "Unmatched embed dimension."
            line = fd.readline()
    if num_words is not None:
        assert num_words == len(vecs) + repeated_count
    num_words = len(vecs)
    # final
    zlog(f"Read ok: w2v num_words={num_words}, embed_size={embed_size}, repeat={repeated_count}")
    return WordVectors(words, vecs)
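
# The expected text format, with an optional "num_words embed_size" header
# line (a tiny illustrative file, assuming sep=" "):
#   2 3
#   the 0.1 0.2 0.3
#   cat 0.4 0.5 0.6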
def load_progress(self, file: str, forward_stream=False):
    old_uidx = self.tp.uidx
    d = default_json_serializer.from_file(file)
    self.tp.from_json(d)
    if forward_stream:
        if old_uidx > self.tp.uidx:
            zwarn(f"Cannot go to the past: {old_uidx} -> {self.tp.uidx}, skip this!")
        else:
            _s = self.train_stream
            for _ in range(self.tp.uidx - old_uidx):
                _, _eos = _s.next_and_check()
                if _eos:  # restart and get one
                    _s.restart()
                    _s.next()
            zlog(f"Forward to the future: {old_uidx} -> {self.tp.uidx}!", func="io")
    zlog(f"Load training progress from {file}", func="io")
    self.adjust_scheduled_values()  # also adjust values!
def get_label_mask(self, sels: List[str]):
    expand_sels = []
    for s in sels:
        if s in UD_CATEGORIES:
            expand_sels.extend(UD_CATEGORIES[s])
        else:
            expand_sels.append(s)
    expand_sels = sorted(set(expand_sels))
    voc = self.voc
    # --
    ret = np.zeros(len(voc))
    _cc = 0
    for s in expand_sels:
        if s in voc:
            ret[voc[s]] = 1.
            _cc += voc.word2count(s)
        else:
            zwarn(f"UNK dep label: {s}")
    _all_cc = voc.get_all_counts()
    zlog(f"Get label mask with {expand_sels}: {len(expand_sels)}=={ret.sum().item()} -> {_cc}/{_all_cc}={_cc/(_all_cc+1e-5)}")
    return BK.input_real(ret)
def _prep_items(self, items: List, par: object, seq_len: int):
    vocab: SeqVocab = self.vocab
    # --
    core_spans = [self.core_span_getter(f.mention) + (f.label_idx,) for f in items]
    _loss_weight_non = getattr(par, "_loss_weight_non", 1.)  # todo(+N): special name; loss_weight_non
    tag_layers = vocab.spans2tags_idx(core_spans, seq_len)
    if len(tag_layers) > 1:
        zwarn(f"Warning: '{self.conf.ftag}' only uses layer0 but the full annotation needs multiple layers with {core_spans}")
        # breakpoint()
    trg_tags = tag_layers[0][0]
    # trg_first_items = [(items[i] if i>=0 else None) for i in tag_layers[0][1]]  # note: put it at the start!
    # return ZObject(loss_weight_non=_loss_weight_non, first_items=trg_first_items, tags=trg_tags, len=len(trg_tags))
    return ZObject(loss_weight_non=_loss_weight_non, tags=trg_tags, len=len(trg_tags))
def __init__(self, conf: TransformerConf, **kwargs):
    super().__init__(conf, **kwargs)
    # --
    conf: TransformerConf = self.conf
    self.tnodes = []
    for i in range(conf.n_layers):
        one_node = _OneTSFNode(conf)
        self.add_module(f"T{i}", one_node)
        self.tnodes.append(one_node)
    # add posi embeddings
    # (note: nope!!)
    self.scale_when_add_posi = math.sqrt(conf.d_model)
    if conf.use_posi:
        self.PE = PosiEmbeddingNode(conf.pconf, osize=conf.d_model)
    # input f
    if conf.wconf.strategy == "addnorm":
        self.input_f = LayerNormNode(None, osize=conf.d_model)
    elif conf.wconf.strategy == "addact":
        self.input_f = ActivationHelper.get_act(conf.wconf.act)
    else:
        zwarn("No calculations for input in TransformerEncoder!!")
        self.input_f = lambda x: x
def __init__(self, conf: TransformerDecConf, **kwargs):
    super().__init__(conf, **kwargs)
    # --
    conf: TransformerDecConf = self.conf
    assert conf.step_dim == 1, "Transformer assumes [bsize, len, D]!!"
    self.tnodes = []
    for i in range(conf.n_layers):
        one_node = _OneTSFDecNode(conf)
        self.add_module(f"T{i}", one_node)
        self.tnodes.append(one_node)
    # add posi embeddings
    if conf.use_posi:
        self.PE = PosiEmbeddingNode(conf.pconf, osize=conf.d_model)
    # input f
    if conf.wconf.strategy == "addnorm":
        self.input_f = LayerNormNode(None, osize=conf.d_model)
    elif conf.wconf.strategy == "addact":
        self.input_f = ActivationHelper.get_act(conf.wconf.act)
    else:
        zwarn("No calculations for input in TransformerDecoder!!")
        self.input_f = lambda x: x