Example No. 1
def do_stat(insts):
    cc = Counter()
    voc = SimpleVocab.build_empty()
    for sent in yield_sents(insts):
        cc["sent"] += 1
        cc["tok"] += len(sent)
        cc["tok_pair"] += len(sent)**2
        _tree = sent.tree_dep
        _deplabs = _tree.seq_label.vals
        _slen = len(sent)
        for i0 in range(_slen):
            for i1 in range(_slen):
                if abs(i0 - i1) > 5:
                    continue
                path1, path2 = _tree.get_path(i0, i1)
                labs1, labs2 = sorted(
                    [[_deplabs[z].split(":")[0] for z in path1],
                     [_deplabs[z].split(":")[0] for z in path2]])
                _len = len(labs1) + len(labs2)
                # if _len<=0 or _len>2 or "punct" in labs1 or "punct" in labs2:
                if _len != 2 or "punct" in labs1 or "punct" in labs2:
                    continue
                _k = (tuple(labs1), tuple(labs2))
                voc.feed_one(_k)
    # --
    zlog(cc)
    voc.build_sort()
    d = voc.get_info_table()
    print(d[:100].to_string())
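The `path1, path2` pair from `_tree.get_path(i0, i1)` is presumably the two halves of the dependency path that meet at the lowest common ancestor of the two tokens. A minimal, hypothetical reimplementation over a plain head array (not the msp2 API), just to pin down the assumed semantics:

def get_path(heads, i0, i1):
    # heads: 0-based head indices, root marked as -1 (hypothetical layout)
    def chain(i):  # the token itself plus all its ancestors, bottom-up
        out = []
        while i >= 0:
            out.append(i)
            i = heads[i]
        return out
    c0, c1 = chain(i0), chain(i1)
    anc1 = set(c1)
    path0 = []
    for z in c0:  # climb from i0 until hitting an ancestor of i1 (the LCA)
        if z in anc1:
            lca = z
            break
        path0.append(z)
    return path0, c1[:c1.index(lca)]

heads = [2, 2, -1, 2]            # tokens 0, 1, 3 all attach to the root at 2
print(get_path(heads, 0, 3))     # ([0], [3]): one edge up on either side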
Example No. 2
def main(args):
    conf = MainConf()
    conf.update_from_args(args)
    # input
    with zopen(conf.input) as fd:
        lines = list(fd)
        if conf.skip_blank:
            lines = [z for z in lines if not str.isspace(z)]
    # shuffle?
    origin_len = len(lines)
    if conf.shuffle_times > 0 or conf.shuffle:
        _t = max(1, conf.shuffle_times)  # at least once!
        _gen = Random.get_generator('')
        for _ in range(_t):
            _gen.shuffle(lines)
    # sample?
    final_size = int(0.999 + (conf.rate *
                              origin_len if conf.rate <= 1. else conf.rate))
    out_lines = lines[:final_size]
    # output
    if conf.output:
        with zopen(conf.output, 'w') as fd2:
            for line in out_lines:
                fd2.write(line)
    # --
    zlog(
        f"Sample({conf.rate}) {conf.input}=>{conf.output}: {origin_len}=>{len(out_lines)}"
    )
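Here `conf.rate` is read as a fraction of the input when it is at most 1 and as an absolute line count otherwise, and `int(0.999 + x)` is a poor man's ceiling for the fractional case. A quick check of the assumed semantics:

def final_size(rate, origin_len):
    # fraction of the input if rate <= 1, absolute line count otherwise
    return int(0.999 + (rate * origin_len if rate <= 1. else rate))

assert final_size(0.1, 1000) == 100
assert final_size(0.25, 10) == 3     # ceil(2.5) via the 0.999 trick
assert final_size(500, 1000) == 500  # rate > 1: absolute count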
Example No. 3
 def __init__(self, conf: MySRLConf, vocab_evt: SimpleVocab, vocab_arg: SimpleVocab, **kwargs):
     super().__init__(conf, **kwargs)
     conf: MySRLConf = self.conf
     self.vocab_evt = vocab_evt
     self.vocab_arg = vocab_arg
     # --
     self.vocab_bio_arg = None
     self.pred_cons_mat = None
     if conf.arg_use_bio:
         self.vocab_bio_arg = SeqVocab(vocab_arg)  # simply BIO vocab
         zlog(f"Use BIO vocab for srl: {self.vocab_bio_arg}")
         if conf.arg_pred_use_seq_cons:
             _m = self.vocab_bio_arg.get_allowed_transitions()
             self.pred_cons_mat = (1. - BK.input_real(_m)) * Constants.REAL_PRAC_MIN  # [L, L]
             zlog(f"Further use BIO constraints for decoding: {self.pred_cons_mat.shape}")
         helper_vocab_arg = self.vocab_bio_arg
     else:
         helper_vocab_arg = self.vocab_arg
     # --
     self.helper = MySRLHelper(conf, self.vocab_evt, helper_vocab_arg)
     # --
     # predicate
     self.evt_node = SingleIdecNode(conf.evt_conf, ndim=conf.isize, nlab=(2 if conf.binary_evt else len(vocab_evt)))
     # argument
     self.arg_node = PairwiseIdecNode(conf.arg_conf, ndim=conf.isize, nlab=len(helper_vocab_arg))
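The `pred_cons_mat` built from `get_allowed_transitions()` presumably acts as an additive penalty on label-pair scores, so that invalid BIO transitions (such as O followed by I-X) are effectively ruled out at decoding time. A self-contained sketch of that masking idea with a toy three-label Viterbi (numpy only; the label set and scores are made up):

import numpy as np

NEG = -1e4  # stand-in for Constants.REAL_PRAC_MIN
labels = ["O", "B-A", "I-A"]
# allowed[i, j] = 1 if label j may follow label i (plain BIO: no O -> I-A)
allowed = np.array([[1, 1, 0],
                    [1, 1, 1],
                    [1, 1, 1]], dtype=float)
cons_mat = (1. - allowed) * NEG        # [L, L], mirrors pred_cons_mat
start_cons = np.array([0., 0., NEG])   # a sequence cannot open inside a span

def viterbi(emit):  # emit: [T, L] per-position label scores
    score, back = emit[0] + start_cons, []
    for t in range(1, len(emit)):
        trans = score[:, None] + cons_mat + emit[t][None, :]  # [L, L]
        back.append(trans.argmax(axis=0))
        score = trans.max(axis=0)
    path = [int(score.argmax())]
    for bp in reversed(back):
        path.append(int(bp[path[-1]]))
    return [labels[i] for i in reversed(path)]

emit = np.array([[0., 0., 5.],  # I-A scores highest but is invalid at start
                 [0., 1., 0.],
                 [0., 0., 1.]])
print(viterbi(emit))  # ['O', 'B-A', 'I-A']: the constraint blocks the bad start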
Example No. 4
def main(output_prefix, *input_files):
    # input
    all_sents = []
    for f in input_files:
        one_reader = ReaderGetterConf().get_reader(input_path=f)
        one_insts = list(one_reader)
        all_sents.append([z for z in yield_sents(one_insts)])
        zlog(f"Read from {f}: {len(all_sents[-1])} sents")
    # align
    sent_map = OrderedDict()
    for sents in all_sents:
        for sent in sents:
            doc_id = sent.info.get("doc_id", "UNK")
            if doc_id.split("/", 1)[0] == "ontonotes":
                doc_id = doc_id.split("/", 1)[1]
            key = doc_id + "|".join(sent.seq_word.vals)  # key by doc_id + words
            if key not in sent_map:
                sent_map[key] = [sent]
            else:
                sent_map[key].append(sent)
    # --
    num_files = len(input_files)
    matched_sents = [vs for vs in sent_map.values() if len(vs) == num_files]
    unmatched_sents = [vs for vs in sent_map.values() if len(vs) != num_files]
    zlog(f"Aligned sent of {len(matched_sents)}")
    breakpoint()
    # output
    for outi in range(num_files):
        out_sents = [z[outi] for z in matched_sents]
        writer = WriterGetterConf().get_writer(
            output_path=f"{output_prefix}{outi}")
        writer.write_insts(out_sents)
        writer.close()
Example No. 5
 def __init__(self,
              conf: PlainInputEmbedderConf,
              voc: SimpleVocab,
              npvec: np.ndarray = None,
              name="UNK"):
     super().__init__(conf, name)
     # --
     conf: PlainInputEmbedderConf = self.conf
     self.voc = voc
     # check init embeddings
     if conf.init_from_pretrain:
         zlog(
             f"Try to init {self.extra_repr()} with npvec.shape={npvec.shape if (npvec is not None) else None}"
         )
         if npvec is None:
             zwarn("warning: cannot get pre-trained embeddings to init!!")
     # get rare unk range
     voc_rare_unk_mask = []
     for w in self.voc.full_i2w:
         c = self.voc.word2count(w, df=None)
         voc_rare_unk_mask.append(
             float(c is not None and c <= conf.rare_unk_thr))
     self.rare_unk_mask = BK.input_real(voc_rare_unk_mask)  # stored tensor!
     # self.register_buffer()  # todo(note): do we need register buffer?
     self.use_rare_unk = (conf.rare_unk_rate > 0. and conf.rare_unk_thr > 0)
     # add the real embedding node
     self.E = EmbeddingNode(conf.econf,
                            npvec=npvec,
                            osize=conf.dim,
                            n_words=len(self.voc))
Example No. 6
 def __init__(self, conf: ZDecoderSRLConf, name: str,
              vocab_evt: SimpleVocab, vocab_arg: SimpleVocab, ref_enc: ZEncoder, **kwargs):
     super().__init__(conf, name, **kwargs)
     conf: ZDecoderSRLConf = self.conf
     self.vocab_evt = vocab_evt
     self.vocab_arg = vocab_arg
     _enc_dim, _head_dim = ref_enc.get_enc_dim(), ref_enc.get_head_dim()
     # --
     self.vocab_bio_arg = None
     self.pred_cons_mat = None
     if conf.arg_use_bio:
         self.vocab_bio_arg = SeqVocab(vocab_arg)  # simply BIO vocab
         zlog(f"Use BIO vocab for srl: {self.vocab_bio_arg}")
         if conf.arg_pred_use_seq_cons:
             _m = self.vocab_bio_arg.get_allowed_transitions()
             self.pred_cons_mat = (1. - BK.input_real(_m)) * Constants.REAL_PRAC_MIN  # [L, L]
             zlog(f"Further use BIO constraints for decoding: {self.pred_cons_mat.shape}")
         helper_vocab_arg = self.vocab_bio_arg
     else:
         helper_vocab_arg = self.vocab_arg
     # --
     self.helper = ZDecoderSRLHelper(conf, self.vocab_evt, helper_vocab_arg, self.vocab_arg)
     # --
     # nodes
     self.evt_node: IdecNode = conf.evt_conf.make_node(_isize=_enc_dim, _nhead=_head_dim, _csize=(2 if conf.binary_evt else len(vocab_evt)))
     self.arg_node: IdecNode = conf.arg_conf.make_node(_isize=_enc_dim, _nhead=_head_dim, _csize=len(helper_vocab_arg))
     self.arg2_node: IdecNode = conf.arg2_conf.make_node(_isize=_enc_dim, _nhead=_head_dim, _csize=len(self.vocab_arg))
     # --
     raise RuntimeError("Deprecated after MED's collecting of scores!!")
Example No. 7
 def run_test_dataset(self, dataset: ZDataset):
     test_recorder = StatRecorder(timing=True)
     for ibatch in dataset.yield_batches(loop=False):
         with test_recorder.go():
             one_res = self.model.predict_on_batch(ibatch)
             test_recorder.record(one_res)
     # --
     # write output
     if self.is_main_process:  # note: do saving only with main process
         dataset.write_insts(None)  # use dataset's conf
     # --
     # eval
     x = test_recorder.summary()
     zlog(f"Test-Info: {OtherHelper.printd_str(x, sep=' ')}")
     aggr = ResultAggregator()
     for task in self.t_center.tasks.values():
         if task.name not in dataset.tasks:
             continue
         tn_res: ResultRecord = task.eval_insts(dataset.gold_insts,
                                                dataset.insts,
                                                quite=False)
         if tn_res is None:
             continue
         aggr.add(task.name, tn_res, task.conf.eval_weight)
     ret = aggr.get_res()
     return ret
Example No. 8
 def adjust_at_ckp(self, sname: str, obj: object, extra_info: str = ""):
     the_idx = getattr(obj, self.sv_conf.which_idx)
     old_val, new_val = self._set(the_idx)
     if self.cur_val != old_val:
         zlog(f"Change scheduled value {self.name}({extra_info}) at {sname}: {old_val} => {self.cur_val}.")
     else:
         zlog(f"Keep scheduled value {self.name}({extra_info}) at {sname} as {self.cur_val}.")
Example No. 9
 def __init__(self, corpus):
     from sklearn.feature_extraction.text import TfidfVectorizer
     vectorizer = TfidfVectorizer()
     X = vectorizer.fit_transform(corpus)
     self.vectorizer = vectorizer
     self.X = X  # [n_sample, n_feat]<Sparse>
     zlog(f"Build SimpleTfIdfSearcher of {X.shape}")
Example No. 10
 def __init__(self, conf: UDAnalyzerConf):
     super().__init__(conf)
     conf: UDAnalyzerConf = self.conf
     # --
     # read main files
     main_insts = list(conf.main.get_reader(input_path=conf.gold))
     self.set_var("main", main_insts, explanation="init")
     # eval
     self.evaler = DparEvaler(conf.econf)
     # --
     all_sents = [list(yield_sents(main_insts))]
     all_toks = [[t for s in yield_sents(main_insts) for t in s.tokens]]
     for one_pidx, one_pred in enumerate(conf.preds):
         one_insts = list(conf.extra.get_reader(input_path=one_pred))  # get all of them
         one_sents = list(yield_sents(one_insts))
         assert len(one_sents) == len(all_sents[0])
         # eval
         eres = self.evaler.eval(main_insts, one_insts)
         zlog(f"#=====\nEval with {conf.main} vs. {one_pred}: res = {eres}\n{eres.get_detailed_str()}")
         # --
         all_sents.append(one_sents)
         all_toks.append([t for s in one_sents for t in s.tokens])
     # --
     s_lists = [MatchedList(z) for z in zip(*all_sents)]
     self.set_var("sl", s_lists, explanation="init")  # sent pair
     s_toks = [MatchedList(z) for z in zip(*all_toks)]
     self.set_var("tl", s_toks, explanation="init")  # token pair
Example No. 11
 def delete(self, wid: int):
     with self.lock:
         if 0 <= wid < len(self.workers):
             self.workers[wid].mark_delete()
         else:
             zlog(f"Err in WPool: invalid worker-id {wid}.")
Example No. 12
 def load_vocabs(self, v_dir: str, quiet=False):
     info = OrderedDict()
     for n, t in self.tasks.items():
         info[n] = t.load_vocab(v_dir)
     if not quiet:
         zlog(f"Load vocabs from {v_dir}, success={info}")
     return info
Example No. 13
def main(args):
    conf: MainConf = init_everything(MainConf(), args, add_nn=False)
    zlog(f"** Run with {conf.to_json()}")
    # --
    if conf.input_table_file:
        with zopen(conf.input_table_file) as fd:
            s = fd.read()
            table = eval(s)
            mm = table[conf.name]  # read the table from file
    else:
        mm = globals()[conf.name]
    # --
    workers = Worker.get_gpu_workers([int(z) for z in conf.gpus],
                                     ncore=int(conf.ncore))
    # --
    x = TuneDriver(conf.tune_conf, workers, conf.name, extra_info=mm)
    task_iter = [
        MyTask(gid, conf.name, sel_idxes,
               [a[i] for a, i in zip(mm, sel_idxes)], conf.task_conf)
        for gid, sel_idxes in enumerate(
            iter_arg_choices(mm,
                             repeat=conf.repeat,
                             shuffle=conf.shuffle,
                             max_num=conf.max_count))
    ]
    x.main(task_iter)
Example No. 14
 def _run_end(self):
     x = self.test_recorder.summary()
     zlog(f"Test-Info: {OtherHelper.printd_str(x, sep=' ')}")
     res = self._go_end()  # use eval results instead!
     res.results['info'] = x  # add the infos
     self.test_recorder.reset()  # reset recorder!
     return res
Example No. 15
 def build_vocabs(self,
                  d_center: DataCenter,
                  try_load_vdir=None,
                  save_vdir=None):
     # first try load vocabs
     if try_load_vdir is not None:
         load_info = self.load_vocabs(try_load_vdir, quiet=True)
     else:
         load_info = OrderedDict()
     load_names = [k for k, v in load_info.items() if v]
     # then build for those not loaded!
     build_names = []
     for n, t in self.tasks.items():
         if not load_info.get(n, False):  # if not loaded!
             t_datasets = d_center.get_datasets(
                 task=n)  # obtain by task name!!
             # --
             assert t.vpack is None
             t.vpack = t.build_vocab(t_datasets)
             # --
             build_names.append(n)
             if save_vdir is not None:
                 t.save_vocab(save_vdir)
     zlog(
         f"Build vocabs: load {load_names} from {try_load_vdir}, build {build_names}"
     )
Example No. 16
 def _proj_grads(self, flattened_grads):
     _shuffle = self.conf.shuffle_losses
     if _shuffle:
         _gen = Random.get_generator('loss')
     _rates = self.conflicting_change_rates
     # --
     all_g = []
     for i, cur_g in enumerate(flattened_grads):
         new_g = cur_g.clone()
         other_idxes = list(range(len(flattened_grads)))
         if _shuffle:
             _gen.shuffle(other_idxes)
         for j in other_idxes:
             other_g = flattened_grads[j]
             rate = _rates[i][j]
             if rate > 0.:
                 _dot = (new_g * other_g).sum()
                 _other_s2 = (other_g * other_g).sum()
                 _offset = (_dot / _other_s2) * other_g
                 new_g.sub_(rate * ((_dot < 0).float() * _offset))
                 # -- just checking!
                 if BK.get_value(_dot).item() < 0:
                     zlog(
                         f"Here! _dot<0 as _dot={_dot}, _off={_dot / _other_s2}"
                     )
                 # --
         all_g.append(new_g)
     ret = BK.stack(all_g, 0).sum(0)  # [*]
     return ret
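This mirrors gradient surgery in the style of PCGrad: whenever two task gradients conflict (negative dot product), the component of one along the other is projected out. A minimal numpy sketch of the core update, with the per-pair rates and loss shuffling left out:

import numpy as np

def project_conflicting(g_i, g_j):
    # if g_i conflicts with g_j (dot < 0), remove g_i's component along g_j:
    #   g_i <- g_i - (g_i . g_j / ||g_j||^2) * g_j
    dot = float(g_i @ g_j)
    if dot < 0.:
        g_i = g_i - (dot / float(g_j @ g_j)) * g_j
    return g_i

g1 = np.array([1., 1.])
g2 = np.array([-1., 0.])  # conflicts with g1 (dot = -1)
g1p = project_conflicting(g1, g2)
print(g1p, float(g1p @ g2))  # [0. 1.] 0.0 -- the conflict is gone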
Example No. 17
 def aug_vocab_and_arr(main_vocab, main_arr, aug_vocab, aug_arr,
                       aug_override: bool):
     # first merge the vocab
     new_vocab = SimpleVocab.merge_simple_vocabs(
         [main_vocab, aug_vocab],
         sorting=False,
         pre_list=main_vocab.pre_list,
         post_list=main_vocab.post_list)
     # then find the arrays
     # todo(+1): which order to find words
     assert aug_override, "To be implemented for other ordering!"
     # --
     new_arr = [main_arr[i] for i in range(len(main_vocab.pre_list))]
     main_hit = aug_hit = 0
     for idx in range(*(new_vocab.non_special_range())):
         word = new_vocab.idx2word(idx)
         # todo(warn): selecting the embeds in aug first (make it possible to override original ones!)
         aug_orig_idx = aug_vocab.get(word)
         if aug_orig_idx is None:
             main_orig_idx = main_vocab[word]  # must be there!
             new_arr.append(main_arr[main_orig_idx])
             main_hit += 1
         else:
             new_arr.append(aug_arr[aug_orig_idx])
             aug_hit += 1
     new_arr.extend(
         [main_arr[i] for i in range(-len(main_vocab.post_list), 0)])
     # --
     zlog(
         f"For the final merged arr, the composition is all={len(new_arr)},main={main_hit},aug={aug_hit}"
     )
     ret_arr = np.asarray(new_arr)
     return new_vocab, ret_arr
Example No. 18
def main(args):
    # conf
    conf: ZOverallConf = init_everything(ZOverallConf(), args)
    # task
    t_center = TaskCenter(conf.tconf)
    # data
    d_center = DataCenter(conf.dconf, specified_wset=["test"])
    # load vocab
    t_center.load_vocabs(t_center.conf.vocab_load_dir)
    # prepare datasets
    t_center.prepare_datasets(d_center.get_datasets())
    # build model
    model = ZModel(conf.mconf)
    t_center.build_mods(model)
    model.finish_sr()  # note: build sr before possible loading in testing!!
    # run
    r_center = RunCenter(conf.rconf, model, t_center, d_center)
    if conf.rconf.model_load_name != "":
        r_center.load(conf.rconf.model_load_name)
    else:
        zwarn("No model to load, Debugging mode??")
    res = r_center.do_test()
    zlog(f"zzzztestfinal: {res}")
    # --
    zlog("The end of Testing.")
Example No. 19
 def do_ae(self, pi: int, ss: str):
     pi = int(pi)
     einfo = self.cur_obj.err_infos[pi]
     old_explains = einfo.get_explains()
     einfo.set_explains(ss)
     new_explains = einfo.get_explains()
     zlog(f"Change #{pi} from {old_explains} to {new_explains}")
Example No. 20
 def pp(self, method: str, printing=False, **kwargs):
     from .helper2 import MyPrettyPrinter
     ff = getattr(MyPrettyPrinter, method)
     ss = ff(self, **kwargs)
     if printing:
         zlog(ss)
     return ss
Example No. 21
 def _next(self):
     ret = self.eos  # by default, EOS if no one can return anything
     cur_idx, cur_ratio = self._cur_idx, self._cur_ratio
     # at most travel one round + 1
     for _ in range(self._num_streamers + 1):
         flag_ok = False
         if cur_ratio <= 0.:
             pass  # no budget
         elif cur_ratio < 1. and next(self._random_sampler) >= cur_ratio:
             pass  # not hit with random ratio
         else:
             # try to get one
             cur_streamer = self._base_streamers[cur_idx]
             one, _iseos = cur_streamer.next_and_check()
             if _iseos:
                 if self._stop_sidx == cur_idx:
                     if self.verbose:
                         zlog(f"End for one, count = {self._cur_counts}",
                              func='report')
                     return self.eos  # directly EOS since SIDX returns EOS
             else:
                 self._cur_counts[cur_idx] += 1  # record
                 ret = one
                 flag_ok = True
         # --
         if flag_ok:
             self._cur_idx, self._cur_ratio = cur_idx, cur_ratio - 1  # cost one for the current step
             break
         else:  # forward one streamer
             cur_idx = (cur_idx + 1) % self._num_streamers
             cur_ratio = float(self._ratios[cur_idx])
     return ret
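So each streamer gets a budget of roughly `ratio` items per visit, with a fractional ratio acting as a per-item inclusion probability. A simplified, self-contained sketch of that budgeted round-robin over plain iterators (it stops at the first exhausted stream, unlike the `_stop_sidx` logic above):

import random

def mix_streams(streams, ratios, seed=0):
    rng = random.Random(seed)
    iters = [iter(s) for s in streams]
    idx, budget = 0, float(ratios[0])
    while True:
        took = False
        for _ in range(len(iters) + 1):  # at most one full round + 1
            if budget > 0. and (budget >= 1. or rng.random() < budget):
                try:
                    yield next(iters[idx])
                    budget -= 1  # cost one for the current step
                    took = True
                    break
                except StopIteration:
                    return
            idx = (idx + 1) % len(iters)  # forward one streamer
            budget = float(ratios[idx])
        if not took:
            return

print(list(mix_streams([[1, 2, 3, 4], "abcd"], [2, 1])))
# [1, 2, 'a', 3, 4, 'b']: two items per visit from the first, one from the second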
Example No. 22
def iter_arg_choices(m: List, repeat=True, shuffle=True, max_num=-1):
    _gen = Random.get_generator("tune")
    # --
    idx = 0
    # expand fully
    args_pool = None
    if not repeat:
        args_pool = [[]]
        for cur_items in m:
            new_pool = []
            for a in args_pool:
                for one_idx in range(len(cur_items)):
                    new_pool.append(a + [one_idx])
            args_pool = new_pool
        # --
        zlog("** Arrange non-repeat iter, sized %d." % len(args_pool))
        if shuffle:
            for _ in range(10):
                _gen.shuffle(args_pool)
        else:
            args_pool.reverse()  # later using pop
    while True:
        if idx == max_num:
            break
        if repeat:
            sel_idxes = [_gen.randint(len(one)) for one in m]
        else:
            if len(args_pool) > 0:
                sel_idxes = args_pool.pop()
            else:
                break
        # -----
        yield sel_idxes  # return selection idxes
        idx += 1
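In the non-repeat case, `args_pool` is just the Cartesian product of the per-option index ranges; an equivalent formulation with itertools (a sketch, not this module's API):

import itertools

m = [["lr:0.001", "lr:0.0001"], ["bs:16", "bs:32", "bs:64"]]
pool = list(itertools.product(*(range(len(items)) for items in m)))
print(len(pool))  # 6 index tuples, e.g. pool[5] == (1, 2)
print([items[i] for items, i in zip(m, pool[5])])  # ['lr:0.0001', 'bs:64']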
Example No. 23
 def __init__(self,
              conf: BertEncoderConf,
              other_embed_nodes: Dict[str, EmbeddingNode] = None,
              other_embed_borrowing=True):
     super().__init__(conf)
     # --
     conf: BertEncoderConf = self.conf
     zlog(
         f"Loading pre-trained bert model for Berter2 of {self.extra_repr()}"
     )
      # make an impl and add module
     self.impl = BerterImpl.create(conf.bert_model,
                                   cache_dir=conf.cache_dir_or_none)
     if conf.bert_ft:  # fine-tune it!
         self.add_module("M", ModuleWrapper(self.impl.model,
                                            None))  # reg it!
     zlog(f"Load ok, move to default device {BK.DEFAULT_DEVICE}")
     self.impl.model.to(BK.DEFAULT_DEVICE)
     # --
     # other embedding node (by default just borrowing)
     self.other_embed_nodes = other_embed_nodes
     if not other_embed_borrowing and other_embed_nodes is not None:  # include the modules
         for _k, _n in other_embed_nodes.items():
             self.add_module(f"_N{_k}", _n)
     # output combiner
     self.combiner = CombinerNode(conf.bert_combiner,
                                  isizes=[self.impl.hidden_size] *
                                  len(conf.bert_output_layers))
     # get max layer
     _nl = self.impl.num_hidden_layers  # 0 is emb layer!!
     self.actual_output_layers = [
         z if z >= 0 else (_nl + 1 + z) for z in conf.bert_output_layers
     ]
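Negative entries in `bert_output_layers` thus count from the top, with index 0 reserved for the embedding layer; for instance, with `_nl = 12` hidden layers:

_nl = 12  # hidden layers; index 0 is the embedding layer, 12 the top layer
layers = [-1, -2, 0]
print([z if z >= 0 else (_nl + 1 + z) for z in layers])  # [12, 11, 0]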
Example No. 24
def main(args):
    conf = MainConf()
    conf.update_from_args(args)
    stat = Counter()
    if conf.domain_f in DOMAIN_FS:
        domain_f = DOMAIN_FS[conf.domain_f]
    else:
        domain_f = eval(conf.domain_f)
    # --
    all_insts = {}  # ID->List[inst]
    with zopen(conf.input) as fin:
        for line in fin:
            inst = json.loads(line)
            domain = domain_f(inst)
            if domain not in all_insts:
                all_insts[domain] = []
            all_insts[domain].append(inst)
            stat["inst"] += 1
            stat[f"inst_{domain}"] += 1
    # --
    # write
    input_name = os.path.basename(conf.input)
    for domain, insts in all_insts.items():
        output_name_fields = input_name.split(".")
        output_name_fields.insert(conf.output_insert_place, domain)
        output_name = os.path.join(conf.output_dir,
                                   ".".join(output_name_fields))
        zlog(f"Write to {output_name} {len(insts)}")
        with zopen(output_name, 'w') as fout:
            default_json_serializer.save_iter(insts, fout)
    # --
    zlog(f"Read from {fin}, stat=\n{OtherHelper.printd_str(stat)}")
Example No. 25
 def do_ap(self, **kwargs):
     cur_obj_info = self.obj_info(self.cur_obj, **kwargs)
     ret = ""
     if cur_obj_info is not None:
         ret += cur_obj_info + "\n"
     ret += self.cur_status
     zlog(ret)
Example No. 26
 def loss_on_batch(self,
                   annotated_insts: List,
                   loss_factor=1.,
                   training=True,
                   **kwargs):
     self.refresh_batch(training)
     # --
     sents: List[Sent] = list(yield_sents(annotated_insts))
     # ==
     # extend to events
     import numpy as np
     bsize = sum(len(z.events) for z in sents)
     mlen = max(len(z) for z in sents)
      arr_preds = np.zeros([bsize, mlen], dtype=np.int32)
     arr_inputs = np.full([bsize, mlen], b'<pad>', dtype=object)
     arr_labels = np.full([bsize, mlen], b'<pad>', dtype=object)
     ii = 0
     for sent in sents:
         for evt in sent.events:
             widx, wlen = evt.mention.get_span()
             assert wlen == 1
             # --
             arr_preds[ii, widx] = 1
             arr_inputs[ii, :len(sent)] = [
                 s.lower().encode() for s in sent.seq_word.vals
             ]
             # --
             tmp_labels = ["O"] * len(sent)
             for arg in evt.args:
                 role = arg.role
                 a_widx, a_wlen = arg.arg.mention.get_span()
                 a_labs = ["B-" + role] + ["I-" + role] * (a_wlen - 1)
                 assert all(z == "O"
                            for z in tmp_labels[a_widx:a_widx + a_wlen])
                 tmp_labels[a_widx:a_widx + a_wlen] = a_labs
             # --
             arr_labels[ii, :len(sent)] = [z.encode() for z in tmp_labels]
             # --
             ii += 1
     assert ii == bsize
     features, labels = data.lookup(({
         "preds": NpWarapper(arr_preds),
         "inputs": NpWarapper(arr_inputs)
     }, NpWarapper(arr_labels)), "train", self.params)
     # ==
     final_loss = self.M(features, labels)
     info = {
         "inst": len(annotated_insts),
         "sent": len(sents),
         "fb": 1,
         "loss": final_loss.item()
     }
     if training:
         assert final_loss.requires_grad
         BK.backward(final_loss, loss_factor)
     zlog(
         f"batch shape = {len(annotated_insts)} {bsize} {mlen} {bsize*mlen}"
     )
     return info
Example No. 27
 def __init__(self, conf: SRLAnalyzerConf):
     super().__init__(conf)
     conf: SRLAnalyzerConf = self.conf
     self.err_map = ErrDetail.ERR_MAPS[conf.err_map]
     # --
     if conf.pre_load:
         self.do_load(conf.pre_load)
     else:
         # further analyze the arguments
         num_pred = len(conf.preds)
         f_lists = self.get_var("fl")  # get frame matches
         f_all_correct_list = []
         f_some_wrong_list = []
         for fl in f_lists:
             gold_frame = fl.gold
             pred_frames = fl.preds
             assert len(pred_frames) == num_pred
             # get them all
             self._process_args(gold_frame)
             err_infos = []
             for pf in pred_frames:
                 self._process_args(pf)  # sort args
                 einfo = ErrInfo.create(gold_frame, pf)
                 err_infos.append(einfo)
             fl.err_infos = err_infos
             # --
             if all(e.fully_correct() for e in err_infos):
                 f_all_correct_list.append(fl)
             else:
                 f_some_wrong_list.append(fl)
         self.set_var("fl1", f_all_correct_list,
                      explanation="init")  # eval pair
         self.set_var("fl0", f_some_wrong_list,
                      explanation="init")  # eval pair
         zlog(
             f"All frames = {len(f_lists)}, all_corr = {len(f_all_correct_list)}({len(f_all_correct_list)/max(1, len(f_lists))})"
         )
         # --
         # breakdowns for all
         for pi in range(num_pred):
             one_err_infos = [
                 e for fl in f_lists for e in fl.err_infos[pi].rps
             ]
             self.set_var(f"eip{pi}", one_err_infos)
             # self.do_group(f"eip{pi}", "d.get_signature('etype', 'etype2', emap=self.err_map)")
             self.do_group(f"eip{pi}",
                           "d.get_signature('etype', emap=self.err_map)")
             self.do_group(f"eip{pi}", "d.get_signature('etype2')")
             # group eip0 "d.get_signature('etype2')"
             # fg eip0 "d.get_signature('explain')!='_'" "d.get_signature('explain')"
         # --
         # get ps objects
         # self.set_var("dps100", self._get_dpath_objects(f_lists, 100))
     # --
     # load vocab
     if conf.load_vocab:
         from msp2.utils import default_pickle_serializer
         self.vocabs, _ = default_pickle_serializer.from_file(
             conf.load_vocab)
Example No. 28
 def __init__(self, conf: PairScorerConf, **kwargs):
     super().__init__(conf, **kwargs)
     conf: PairScorerConf = self.conf
     # --
     # rs
     self.rs = RScorerNode(conf.rs)
     self.apply_osize = conf.osize * self.rs.piece
     # --
     self.use_input_flags = [
         conf.use_input0, conf.use_input1, conf.use_input_pair
     ]  # use what in ff1 and ff2
     self.input_sizes = [conf.isize0, conf.isize1, conf.isizeP]
     self.input_sizes_valid = [
         s for s, f in zip(self.input_sizes, self.use_input_flags) if f
     ]
     # --
     # add components
     assert conf.use_ff1 or conf.use_ff2 or conf.use_biaffine, "No real calculations here!"
     # bias
     if conf.use_bias:
         self.B = BK.new_param([self.apply_osize])
     # ff1
     if conf.use_ff1:
         self.FF1s = []
         for one_size, one_flag in zip(self.input_sizes,
                                       self.use_input_flags):
             if one_flag:
                 one_node = get_mlp(
                     one_size, self.apply_osize, conf.ff1_hid_size,
                     conf.ff1_hid_layer,
                     AffineConf().direct_update(out_act=conf.ff1_hid_act),
                     AffineConf().direct_update(no_drop=True,
                                                use_bias=False))
                 self.add_module(f"FF1_{len(self.FF1s)}", one_node)
             else:
                 one_node = lambda x: 0.
             self.FF1s.append(one_node)
     # ff2
     if conf.use_ff2:
         self.FF2 = get_mlp(
             self.input_sizes_valid, self.apply_osize, conf.ff2_hid_size,
             conf.ff2_hid_layer,
             AffineConf().direct_update(out_act=conf.ff2_hid_act,
                                        which_affine=3),
             AffineConf().direct_update(no_drop=True, use_bias=False))
     # biaffine
     if conf.use_biaffine:
          # this is different from BK.bilinear or layers.BiAffine
         self.BW = BK.new_param(
             [conf.isize0, conf.isize1 * self.apply_osize])
         if conf.biaffine_div <= 0.:
             conf.biaffine_div = (conf.isize0 *
                                  conf.isize1)**0.25  # sqrt(sqrt(in1*in2))
             zlog(
                 f"Adopt biaffine_div of {conf.biaffine_div} for the current PairScorer!"
             )
     # todo(note): no dropout at output, add it if needed!
     # --
     self.reset_parameters()
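The biaffine term presumably folds the output dimension into `BW`'s second axis and recovers it by reshaping, scoring every (input0, input1) pair at once; a shape-level numpy sketch with made-up dimensions:

import numpy as np

n0, n1, isize0, isize1, osize = 4, 5, 8, 8, 3
x0 = np.random.randn(n0, isize0)
x1 = np.random.randn(n1, isize1)
BW = np.random.randn(isize0, isize1 * osize)   # [isize0, isize1*osize]
# [n0, isize1*osize] -> [n0, isize1, osize], then contract with x1 over isize1
tmp = (x0 @ BW).reshape(n0, isize1, osize)
scores = np.einsum("nio,mi->nmo", tmp, x1)     # [n0, n1, osize]
print(scores.shape)  # (4, 5, 3)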
Example No. 29
def main(args):
    conf, model, vpack, test_iter = prepare_test(args)
    dconf = conf.dconf
    # go
    rr = ZsfpTestingRunner(model, test_iter, conf, dconf.output, dconf.test)
    res = rr.run()
    zlog(f"zzzfinal: {res}")
    zlog("The end of testing.")
Example No. 30
 def set_var(self, target: str, v: object, explanation=None):
     if hasattr(self.vars, target):
         zlog(f"Overwriting the existing var `{target}'")
     if target not in self.traces:
         self.traces[target] = []
     # (explanation, history-idx)
     self.traces[target].append((explanation, len(self.history)))
     setattr(self.vars, target, v)